diff --git a/atvc/README.md b/atvc/README.md
index ce4a1b871c216db6d824acd28afd10d273fb965c..620e28c13366d2afc4f0cb10eedbcaa8d9ce3b78 100644
--- a/atvc/README.md
+++ b/atvc/README.md
@@ -72,15 +72,17 @@ Accuracy verification passed.
 | Reduce template |
 | Broadcast template |
 # Examples
-| Example | Description |
-| ------------------------------------------------------------ | ------------------------------------------------------------ |
-| [add](./examples/add/add.cpp) | Add operator implemented with the ATVC Ele-wise template, with an invocation example |
-| [sinh_custom](./examples/sinh_custom/sinh_custom.cpp) | Custom Ele-wise operator that uses a temporary tensor in its computation, with an invocation example |
-| [add_with_scalar](./examples/add_with_scalar/add_with_scalar.cpp) | Custom Ele-wise operator with a scalar input, with an invocation example |
-| [reduce_sum](./examples/reduce_sum/reduce_sum.cpp) | Custom ReduceSum operator implemented with the ATVC Reduce template, with an invocation example |
-| [broadcast_to](./examples/broadcast_to/broadcast_to.cpp) | Custom BroadcastTo operator implemented with the ATVC Broadcast template, with an invocation example |
-
-For more operator categories and guidance on choosing a template, see the [Quick Start](./docs/1_quick_start.md).
+| Example | Description | Type |
+| ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| [add](./examples/add/add.cpp) | Add operator implemented with the ATVC Ele-wise template, with an invocation example | directly invoked operator |
+| [sinh_custom](./examples/sinh_custom/sinh_custom.cpp) | Custom Ele-wise operator that uses a temporary tensor in its computation, with an invocation example | directly invoked operator |
+| [add_with_scalar](./examples/add_with_scalar/add_with_scalar.cpp) | Custom Ele-wise operator with a scalar input, with an invocation example | directly invoked operator |
+| [reduce_sum](./examples/reduce_sum/reduce_sum.cpp) | Custom ReduceSum operator implemented with the ATVC Reduce template, with an invocation example | directly invoked operator |
+| [broadcast_to](./examples/broadcast_to/broadcast_to.cpp) | Custom BroadcastTo operator implemented with the ATVC Broadcast template, with an invocation example | directly invoked operator |
+| [ops_aclnn](./examples/aclnn) | Operator implementations and invocation examples built on ATVC inside a custom operator project | custom-project operator |
+| [ops_pytorch](./examples/pytorch) | Operator implementations and invocation examples built on ATVC for [pytorch](https://gitee.com/ascend/pytorch) | pytorch operator |
+
+For more operator categories and guidance on choosing a template, see the [Quick Start](./docs/1_quick_start.md). add, sinh_custom, add_with_scalar, reduce_sum, and broadcast_to are ATVC direct-invocation examples; ops_aclnn is the directory of operators that connect ATVC to an aclnn project, and ops_pytorch is the directory of operators that connect ATVC to a pytorch project.
diff --git a/atvc/docs/2_developer_guide.md b/atvc/docs/2_developer_guide.md
index 412e4ce5f3d19ca80d62dd7490023721d7ef0f94..c236bd991ca57a6d0eeb9c3dbf3ac16bb9aa81c3 100644
--- a/atvc/docs/2_developer_guide.md
+++ b/atvc/docs/2_developer_guide.md
@@ -86,7 +86,7 @@ EleWiseKernel<<<blockNum, nullptr, stream>>>(x, y, z, paramDevice);
 The following pseudocode shows how `ATVC::ReducePolicy` takes part in the computation when developing a Reduce operator; for the full flow see 2.2.5 Host-layer API:
 ```cpp
 // Declare the policy and param variables
-ATVC::ReducePolicy policy;
+ATVC::ReducePolicy policy = {-1, -1, -1};
 ATVC::ReduceParam param;
 // OpTraits is the operator description; the CalcReduceTiling API computes the best data-movement strategy (param) for this case and the policy that maps to the best template implementation
 ATVC::Host::CalcReduceTiling(..., &policy, &param);
@@ -671,7 +671,7 @@ int32_t main(int32_t argc, char* argv[])
     std::vector<int64_t> dim{0};
     std::vector<int64_t> shape{8, 1024};
     ATVC::ReduceParam param;
-    ATVC::ReducePolicy policy;
+    ATVC::ReducePolicy policy = {-1, -1, -1};
     if (!ATVC::Host::CalcReduceTiling(shape, dim, &param, &policy)) {
         printf("Reduce tiling error.");
         return -1;
@@ -727,7 +727,7 @@ using ReduceOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
-    ATVC::ReducePolicy policy;
+    ATVC::ReducePolicy policy = {-1, -1, -1};
     if (!ATVC::Host::CalcReduceTiling<ReduceOpTraits>(inputShape, reduceDim, &policy, &param)) {
         printf("Reduce tiling error.");
@@ -901,7 +901,7 @@ int32_t main(int32_t argc, char* argv[])
     CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
     ATVC::ReduceParam param;  // Reduce runtime parameter, holding the TilingData and temporary-buffer information
-    ATVC::ReducePolicy policy;  // Reduce runtime parameter that maps to the most suitable Reduce template implementation
+    ATVC::ReducePolicy policy = {-1, -1, -1};  // Reduce runtime parameter that maps to the most suitable Reduce template implementation
     // On the host side, call the tiling API to compute the runtime parameters
     if (!ATVC::Host::CalcReduceTiling<ReduceOpTraits>(shape, dim, &policy, &param)) {
         printf("Reduce tiling error.");
@@ -1153,7 +1153,7 @@ int32_t main(int32_t argc, char* argv[])
     std::vector<int64_t> shapeIn{1, 1024};   // test input shape
     std::vector<int64_t> shapeOut{8, 1024};  // test output shape
     ATVC::BroadcastParam param;  // Broadcast runtime parameter, holding the TilingData and temporary-buffer information
-    ATVC::BroadcastPolicy policy;  // Broadcast runtime parameter that maps to the most suitable Broadcast template implementation
+    ATVC::BroadcastPolicy policy = {-1, -1, -1};  // Broadcast runtime parameter that maps to the most suitable Broadcast template implementation
     // On the host side, call the tiling API to compute the runtime parameters
     if (!ATVC::Host::CalcBroadcastTiling(shapeIn, shapeOut, &policy, &param)) {
         printf("Broadcast tiling error.\n");
@@ -1200,7 +1200,7 @@ int32_t main(int32_t argc, char* argv[])
     // acl resource initialization
     ...
     ATVC::BroadcastParam param;  // Broadcast runtime parameter, holding the TilingData and temporary-buffer information
-    ATVC::BroadcastPolicy policy;  // Broadcast runtime parameter that maps to the most suitable Broadcast template implementation
+    ATVC::BroadcastPolicy policy = {-1, -1, -1};  // Broadcast runtime parameter that maps to the most suitable Broadcast template implementation
     // On the host side, call the tiling API to compute the runtime parameters
     (void)ATVC::Host::CalcBroadcastTiling(shapeIn, shapeOut, &policy, &param);
@@ -1323,7 +1323,7 @@ int32_t main(int32_t argc, char* argv[])
     CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
     ATVC::BroadcastParam param;  // Broadcast runtime parameter, holding the TilingData and temporary-buffer information
-    ATVC::BroadcastPolicy policy;  // Broadcast runtime parameter that maps to the most suitable Broadcast template implementation
+    ATVC::BroadcastPolicy policy = {-1, -1, -1};  // Broadcast runtime parameter that maps to the most suitable Broadcast template implementation
     // On the host side, call the tiling API to compute the runtime parameters
     if (!ATVC::Host::CalcBroadcastTiling(shapeIn, shapeOut, &policy, &param)) {
         printf("Broadcast tiling error.\n");
diff --git a/atvc/examples/aclnn/README.md b/atvc/examples/aclnn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..35df784ff75fa94f0c7b5109539eac6065ffaae7
--- /dev/null
+++ b/atvc/examples/aclnn/README.md
@@ -0,0 +1,296 @@
+## Overview
+Simple examples of connecting ATVC to an ACLNN project, suitable for beginners.
+
+## Custom operator samples
+Using the Ascend C programming language, these samples connect the ATVC framework to a custom operator project and provide a complete end-to-end implementation for each invocation method.
+
+## Operator development samples
+| Directory | Description |
+| ------------------------------------------------------------ | ---------------------------------------------------- |
+| [add](./add) | Custom Add vector operator based on the ATVC framework, with an AclNNInvocation sample |
+| [reduce_sum](./reduce_sum) | Custom reduce_sum vector operator based on the ATVC framework, with an AclNNInvocation sample |
+
+## Quick start
+
+  Run the example cases quickly; for a more detailed walkthrough see the [add operator](../add/README.md).
+
+- Download the ATVC code and configure the environment as described in the [Quick Start](../../../docs/1_quick_start.md).
+
+- Export the ATVC environment variable
+  ```bash
+  # If not set, the default path ./atvc/include is used
+  export ATVC_PATH=${atvc}/include
+  ```
+
+  - Run the add case
+
+    ```bash
+    # Build the custom Add operator with ATVC
+    $ cd ./atvc/examples/aclnn/add
+    # Taking 910B1 as an example, run:
+    $ bash install.sh -v Ascend910B1
+    # Install the custom package
+    $ cd CustomOp/build_out
+    $ ./custom_opp*.run
+    # Run the sample
+    $ cd ./atvc/examples/aclnn/add/AclNNInvocationNaive
+    $ bash run.sh
+    ...
+    test pass
+    ```
+
+## Supporting custom operators on the ATVC framework
+### Step 1. Generate the base directories and files of the custom operator project
+  Use [msopgen](https://www.hiascend.com/document/detail/zh/mindstudio/81RC1/ODtools/Operatordevelopmenttools/atlasopdev_16_0021.html) to create the base files of the operator project.
+  ```bash
+  rm -rf CustomOp
+  # Generate the op framework
+  msopgen gen -i AddCustom.json -c ai_core-Ascend910B1 -lan cpp -out CustomOp
+  ```
+  The generated directory structure is as follows:
+  ```
+  CustomOp
+  ├── build.sh                // build entry script
+  ├── cmake
+  │   ├── config.cmake
+  │   ├── util                // scripts and common build files used to build the operator project
+  ├── CMakeLists.txt          // CMakeLists.txt of the operator project
+  ├── CMakePresets.json       // build options
+  ├── framework               // operator plugin implementation directory; generating single-operator model files does not depend on it, so it can be ignored
+  ├── op_host                 // host-side implementation files
+  │   ├── add_custom_tiling.h // tiling definition of the operator
+  │   ├── add_custom.cpp      // operator prototype registration, shape inference, info library, tiling implementation
+  │   ├── CMakeLists.txt
+  ├── op_kernel               // kernel-side implementation files
+  │   ├── CMakeLists.txt
+  │   ├── add_custom.cpp      // operator implementation
+  ├── scripts                 // packaging scripts of the custom operator project
+  ```
+
+### Step 2. Modify the corresponding files
+
+- 2.1 Copy the required configuration files
+
+  Copy [func.cmake](./add/AddCustom/cmake/func.cmake), the host-side [CMakeLists.txt](./add/AddCustom/op_host/CMakeLists.txt) and the kernel-side [CMakeLists.txt](./add/AddCustom/op_kernel/CMakeLists.txt) into the matching directories of the project generated in Step 1.
+
+- 2.2 Modify the host files
+  - Include the matching header and rewrite the tiling generation inside TilingFunc, calling a different tiling strategy per operator category. For more ATVC usage see the ATVC [developer guide](../../docs/2_developer_guide.md).
+
+    Ele-wise category; see [add_custom.cpp](./add/AddCustom/op_host/add_custom.cpp)
+    ```cpp
+    // Include the header
+    #include "elewise/elewise_host.h"
+    ...
+    // Define the operator descriptions
+    using AddOpTraitsFloat = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>>;
+    using AddOpTraitsInt = ATVC::OpTraits<ATVC::OpInputs<int32_t, int32_t>, ATVC::OpOutputs<int32_t>>;
+
+    // Modify the corresponding TilingFunc
+    // Declare the runtime tiling parameter
+    ATVC::EleWiseParam *tiling = context->GetTilingData<ATVC::EleWiseParam>();
+    uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize();
+    // Select the operator description by data type
+    if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_FLOAT) {
+        // AddOpTraitsFloat is the Add operator description; the tiling data is computed from the operator's input/output count and the actual element count
+        (void)ATVC::Host::CalcEleWiseTiling<AddOpTraitsFloat>(totalLength, *tiling);
+    } else if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_INT32) {
+        (void)ATVC::Host::CalcEleWiseTiling<AddOpTraitsInt>(totalLength, *tiling);
+    }
+    // Set the tiling key
+    context->SetTilingKey(0);
+    // Set blockDim
+    context->SetBlockDim(tiling->tilingData.blockNum);
+    // Set the workspace size
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
+    currentWorkspace[0] = 0;
+    ...
+    ```
+
+    Broadcast category
+    ```cpp
+    #include "broadcast/broadcast_host.h"
+    // Define the operator descriptions
+    using BroadcastOpTraitsFloat = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+    using BroadcastOpTraitsInt = ATVC::OpTraits<ATVC::OpInputs<int32_t>, ATVC::OpOutputs<int32_t>>;
+    ...
+    // Modify the corresponding TilingFunc
+    // Get the input and output shapes
+    std::vector<int64_t> shapeIn;
+    std::vector<int64_t> shapeOut;
+    for (int32_t i = 0; i < inputShape0.GetDimNum(); i++) {
+        shapeIn.push_back(inputShape0.GetDim(i));
+    }
+    for (int32_t i = 0; i < outputShape0.GetDimNum(); i++) {
+        shapeOut.push_back(outputShape0.GetDim(i));
+    }
+    // Declare the runtime tiling parameter
+    ATVC::BroadcastParam *tiling = context->GetTilingData<ATVC::BroadcastParam>();
+    ATVC::BroadcastPolicy policy = {-1, -1, -1};
+    // Select the operator description by data type
+    if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_FLOAT) {
+        // BroadcastOpTraitsFloat is the Broadcast operator description; the tiling data is computed from the input and output shapes
+        (void)ATVC::Host::CalcBroadcastTiling<BroadcastOpTraitsFloat>(shapeIn, shapeOut, &policy, tiling);
+    } else if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_INT32) {
+        (void)ATVC::Host::CalcBroadcastTiling<BroadcastOpTraitsInt>(shapeIn, shapeOut, &policy, tiling);
+    }
+    // Bind each policy to a tiling key; the kernel side uses the tiling key to pick the matching operator template
+    if (policy == ATVC::BROADCAST_POLICY0) {
+        context->SetTilingKey(0);
+    } else if (policy == ATVC::BROADCAST_POLICY1) {
+        context->SetTilingKey(1);
+    }
+    // Set blockDim
+    context->SetBlockDim(tiling->tilingData.coreNum);
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
+    currentWorkspace[0] = 0;
+    ```
+
+    Reduce-sum category; see [reduce_sum_custom.cpp](./reduce_sum/ReduceSumCustom/op_host/reduce_sum_custom.cpp)
+    ```cpp
+    // Include the header
+    #include "reduce/reduce_host.h"
+    // Define the operator descriptions
+    using ReduceOpTraitsFloat = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+    using ReduceOpTraitsInt = ATVC::OpTraits<ATVC::OpInputs<int32_t>, ATVC::OpOutputs<int32_t>>;
+    ...
+    // Modify the corresponding TilingFunc
+    ATVC::ReducePolicy policy = {0, 0, 0};
+    auto inputShape0 = context->GetInputShape(0)->GetOriginShape();
+    std::vector<int64_t> shapeIn;
+    for (int32_t i = 0; i < inputShape0.GetDimNum(); i++) {
+        shapeIn.push_back(inputShape0.GetDim(i));
+    }
+    // Get the dim attribute
+    const gert::RuntimeAttrs *runtimeAttrs = context->GetAttrs();
+    const gert::TypedContinuousVector<int64_t> *attr0 = runtimeAttrs->GetListInt(0);
+    const int64_t *arr = reinterpret_cast<const int64_t *>(attr0->GetData());
+    std::vector<int64_t> dim(arr, arr + attr0->GetSize());
+
+    // Declare the runtime tiling parameter
+    ATVC::ReduceParam *tiling = context->GetTilingData<ATVC::ReduceParam>();
+    // Select the operator description by data type
+    if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_FLOAT) {
+        // ReduceOpTraitsFloat is the Reduce operator description; the tiling data is computed from the input shape and dim
+        (void)ATVC::Host::CalcReduceTiling<ReduceOpTraitsFloat>(shapeIn, dim, &policy, tiling);
+    } else if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_INT32) {
+        (void)ATVC::Host::CalcReduceTiling<ReduceOpTraitsInt>(shapeIn, dim, &policy, tiling);
+    }
+    // Record the policyId so the kernel side can branch on it
+    tiling->policyId = policy.getID();
+    // Set blockDim
+    context->SetBlockDim(tiling->tilingData.coreNum);
+    ```
+  - 2.3 Modify the kernel files
+
+    Developers build the core computation of the Add operator with AscendC APIs. In the ATVC framework this computation is implemented as a functor struct: it takes `ATVC::OpTraits` as a fixed template parameter and overloads `operator()` so the provided kernel-layer operator template class can call it. For more ATVC usage see the ATVC [developer guide](../../docs/2_developer_guide.md).
+
+    Ele-wise category: [add_custom.cpp](./add/AddCustom/op_kernel/add_custom.cpp)
+    ```cpp
+    // Include the header
+    #include "elewise/elewise_device.h"
+    ...
+    // Define the operator description
+    using AddOpTraits = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>>;
+
+    ...
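+    // Note: per the developer guide, EleWiseOpTemplate owns the data movement between
+    // global and local memory; the functor below only has to describe the per-tile
+    // computation on the LocalTensor slices that the template hands to operator().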
+    // Add AddComputeFunc
+    // ATVC::OpTraits is passed in as the compile-time parameter
+    template <typename OpTraits>
+    struct AddComputeFunc {
+        /*
+        Function: z = x + y
+        Parameters:
+            x : input of the computation
+            y : input of the computation
+            z : output of the computation
+        */
+        template <typename DataType>
+        // Overload operator() so the provided operator template class can call it
+        __aicore__ inline void operator()(AscendC::LocalTensor<DataType> x, AscendC::LocalTensor<DataType> y, AscendC::LocalTensor<DataType> z) {
+            AscendC::Add(z, x, y, z.GetSize()); // Implement the computation with AscendC APIs; z.GetSize() returns the element count of a single pass
+        }
+    };
+
+    // Modify the kernel function implementation
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);
+    REGISTER_TILING_DEFAULT(ATVC::EleWiseParam);
+    auto op = ATVC::Kernel::EleWiseOpTemplate<AddComputeFunc<AddOpTraits>>();
+    op.Run(x, y, z, tiling);
+    ```
+    Broadcast category
+    ```cpp
+    // Include the header
+    #include "broadcast/broadcast_device.h"
+    // Define the operator description
+    using BroadcastOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+    ...
+
+    // Modify the kernel function
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);
+    REGISTER_TILING_DEFAULT(ATVC::BroadcastParam);
+    // Broadcast has several policies, bound to tiling keys on the host; TILING_KEY_IS here matches the host file's SetTilingKey calls
+    if (TILING_KEY_IS(0)) {
+        auto op = ATVC::Kernel::BroadcastOpTemplate<BroadcastComputeFunc<BroadcastOpTraits>, ATVC::BROADCAST_POLICY0>();
+        op.Run(x, y, tiling);
+    } else {
+        ...
+    }
+    ```
+    Reduce-sum category: [reduce_sum_custom.cpp](./reduce_sum/ReduceSumCustom/op_kernel/reduce_sum_custom.cpp)
+    ```cpp
+    // Include the header
+    #include "reduce/reduce_device.h"
+    // Define the operator description
+    using ReduceOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+    ...
+
+    // Modify the kernel function
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0);
+    REGISTER_TILING_DEFAULT(ATVC::ReduceParam);
+    GET_TILING_DATA(param, tiling);
+    if (param.policyId == ATVC::REDUCE_POLICY0.ID) {
+        auto op = ATVC::Kernel::ReduceOpTemplate<ReduceSumComputeFunc<ReduceOpTraits>, ATVC::REDUCE_POLICY0>();
+        op.Run(x, y, tiling);
+    } else {
+        // Instantiate a different ReduceOpTemplate according to tiling.policyId
+        ...
+    }
+    ```
+    TILING_KEY_IS is not used for branching here: Reduce has many policies, and dispatching on the tiling key can overflow the stack, so branching on `param.policyId` is recommended instead.
+
+### Step 3. Build the operator project
+
+  Run the following commands in the operator project directory to build it.
+  ```bash
+  $ cd CustomOp
+  $ bash build.sh
+  ```
+After the script succeeds and the build completes, the custom operator package custom_opp_<os>_<arch>.run, for example "custom_opp_ubuntu_x86_64.run", is generated under CustomOp/build_out.
+
+### Step 4. Deploy the custom operator package
+
+  Before deploying the package, make sure the default deployment path environment variable ASCEND_OPP_PATH exists
+
+  ```bash
+  echo $ASCEND_OPP_PATH
+  # sample output: /usr/local/Ascend/ascend-toolkit/latest/opp
+
+  # if it is missing, export the CANN environment variables
+  source [ASCEND_INSTALL_PATH]/bin/setenv.bash
+  # e.g. source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash
+  ```
+Parameters:
+
+  ASCEND_INSTALL_PATH: installation path of the CANN package, usually the same path as in the previous step.
+In the directory containing the custom operator package, run the commands below to install it.
+Once the command succeeds, the files in the package are deployed into the vendors/customize directory under the opp operator library path pointed to by the ASCEND_OPP_PATH environment variable.
+
+  ```bash
+  cd build_out
+  ./custom_opp_<os>_<arch>.run
+  ```
+
+### Step 5. Invoke the operator project
+
+  With the operator files complete, follow [invoking the AddCustom operator project via aclnn (simplified code)](https://gitee.com/ascend/samples/blob/master/operator/ascendc/0_introduction/1_add_frameworklaunch/AclNNInvocationNaive/README.md) to build and verify.
\ No newline at end of file
diff --git a/atvc/examples/aclnn/add/AclNNInvocationNaive/CMakeLists.txt b/atvc/examples/aclnn/add/AclNNInvocationNaive/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..57a2c66376526b6a80f9d407e3f4e13d78022104
--- /dev/null
+++ b/atvc/examples/aclnn/add/AclNNInvocationNaive/CMakeLists.txt
@@ -0,0 +1,60 @@
+# Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+
+# CMake lowest version requirement
+cmake_minimum_required(VERSION 3.5.1)
+
+# project information
+project(acl_execute_add)
+
+# Compile options
+add_compile_options(-std=c++11)
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "./")
+
+set(INC_PATH $ENV{DDK_PATH})
+
+if (NOT DEFINED ENV{DDK_PATH})
+    set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest")
+    message(STATUS "set default INC_PATH: ${INC_PATH}")
+else ()
+    message(STATUS "env INC_PATH: ${INC_PATH}")
+endif()
+
+set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/customize/op_api")
+
+set(LIB_PATH $ENV{NPU_HOST_LIB})
+
+# Dynamic libraries in the stub directory can only be used for compilation
+if (NOT DEFINED ENV{NPU_HOST_LIB})
+    string(TOLOWER "${CMAKE_SYSTEM_NAME}" SYSTEM_NAME_LOWER)
+    set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/${CMAKE_SYSTEM_PROCESSOR}-${SYSTEM_NAME_LOWER}/devlib")
+    message(STATUS "set default LIB_PATH: ${LIB_PATH}")
+else ()
+    message(STATUS "env LIB_PATH: ${LIB_PATH}")
+endif()
+
+# Header path
+include_directories(
+    ${INC_PATH}/include
+    ${CUST_PKG_PATH}/include
+)
+
+# add host lib path
+link_directories(
+    ${LIB_PATH}
+    ${CUST_PKG_PATH}/lib
+)
+
+add_executable(execute_add_op
+    main.cpp
+)
+
+target_link_libraries(execute_add_op
+    ascendcl
+    cust_opapi
+    acl_op_compiler
+    nnopbase
+    stdc++
+)
+
+install(TARGETS execute_add_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
diff --git a/atvc/examples/aclnn/add/AclNNInvocationNaive/README.md b/atvc/examples/aclnn/add/AclNNInvocationNaive/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e826fd573f773eb04fa6ade96590eadc9c750501
--- /dev/null
+++ b/atvc/examples/aclnn/add/AclNNInvocationNaive/README.md
@@ -0,0 +1,52 @@
+## Overview
+Compared with the AclNNInvocation sample project, this sample simplifies the project configuration.
+## Directory layout
+```
+├── AclNNInvocationNaive
+│   ├── CMakeLists.txt // build rules
+│   ├── main.cpp       // entry point of the single-operator invocation application
+│   └── run.sh         // script that builds and runs the operator
+```
+## Implementation notes
+After the custom operator is developed and deployed, its behaviour can be verified through a single-operator invocation. main.cpp uses the single-operator API execution mode: the operator is executed through C APIs, with no single-operator description file or offline model conversion; the single-operator API is called directly.
+
+After the custom operator is built and deployed, the single-operator API is generated automatically and can be called directly from an application. The operator API generally takes the form of a "two-stage interface", like:
+ ```cpp
+ // Get the workspace size the operator needs
+ aclnnStatus aclnnAddCustomGetWorkspaceSize(const aclTensor *x, const aclTensor *y, const aclTensor *out, uint64_t *workspaceSize, aclOpExecutor **executor);
+ // Execute the operator
+ aclnnStatus aclnnAddCustom(void *workspace, uint64_t workspaceSize, aclOpExecutor *executor, aclrtStream stream);
+ ```
+`aclnnAddCustomGetWorkspaceSize` is the first stage: it computes how much workspace memory this API call needs. After obtaining the required `workspace` size, allocate that much device memory, then call the second stage, `aclnnAddCustom`, to run the computation. See the [single-operator API invocation](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp) chapter for details.
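+
+A minimal sketch of the two-stage call flow (assumptions: `x`, `y`, `out` are valid `aclTensor` handles, `stream` is a created `aclrtStream`, and error handling is elided; main.cpp below shows the complete version):
+ ```cpp
+ uint64_t workspaceSize = 0;
+ aclOpExecutor *executor = nullptr;
+ // Stage 1: query how much device workspace this call needs
+ aclnnStatus st = aclnnAddCustomGetWorkspaceSize(x, y, out, &workspaceSize, &executor);
+ void *workspace = nullptr;
+ if (st == ACL_SUCCESS && workspaceSize > 0) {
+     aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); // allocate the requested workspace
+ }
+ // Stage 2: launch the operator on the stream with that workspace
+ st = aclnnAddCustom(workspace, workspaceSize, executor, stream);
+ aclrtSynchronizeStream(stream); // wait for completion before reading results
+ ```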
+## Running the sample operator
+### 1. Build the operator project
+Before running this sample, finish the preparation described in [building the operator project](../README.md#operatorcompile).
+### 2. Run the aclnn invocation sample
+
+  - Enter the sample directory
+    Download the sample code from the command line, taking the master branch as an example.
+    ```bash
+    cd atvc/examples/aclnn/add/AclNNInvocationNaive
+    ```
+  - Adjust the sample build file
+
+    In CMakeLists.txt, replace "/usr/local/Ascend/ascend-toolkit/latest" with the actual installation path of the CANN package,
+    e.g. /home/HwHiAiUser/Ascend/ascend-toolkit/latest
+
+  - Configure environment variables
+
+    The NPU_HOST_LIB environment variable must be set; taking x86 as an example:
+    ```bash
+    export NPU_HOST_LIB=/home/HwHiAiUser/Ascend/ascend-toolkit/latest/x86_64-linux/lib64
+    ```
+  - Run the sample
+
+    Use the run.sh script to build and run.
+    ```bash
+    bash run.sh
+    ```
+
+## Change log
+| Date | Change |
+| ---------- | ------------ |
+| 2025/07/22 | Added this readme |
\ No newline at end of file
diff --git a/atvc/examples/aclnn/add/AclNNInvocationNaive/main.cpp b/atvc/examples/aclnn/add/AclNNInvocationNaive/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..694a61ce29a8f7997123c84f2b93127240a1eb7b
--- /dev/null
+++ b/atvc/examples/aclnn/add/AclNNInvocationNaive/main.cpp
@@ -0,0 +1,199 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <random>
+#include <vector>
+
+#include "acl/acl.h"
+#include "aclnn_add_custom.h"
+namespace {
+#define CHECK_RET(cond, return_expr) \
+    do {                             \
+        if (!(cond)) {               \
+            return_expr;             \
+        }                            \
+    } while (0)
+
+#define LOG_PRINT(message, ...)         \
+    do {                                \
+        printf(message, ##__VA_ARGS__); \
+    } while (0)
+
+int64_t GetShapeSize(const std::vector<int64_t> &shape)
+{
+    int64_t shapeSize = 1;
+    for (auto i : shape) {
+        shapeSize *= i;
+    }
+    return shapeSize;
+}
+
+int Init(int32_t deviceId, aclrtStream *stream)
+{
+    // Fixed code, acl initialization
+    auto ret = aclInit(nullptr);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return 1);
+    ret = aclrtSetDevice(deviceId);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return 1);
+    ret = aclrtCreateStream(stream);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return 1);
+
+    return 0;
+}
+
+template <typename T>
+int CreateAclTensor(const std::vector<T> &hostData, const std::vector<int64_t> &shape, void **deviceAddr,
+                    aclDataType dataType, aclTensor **tensor)
+{
+    auto size = GetShapeSize(shape) * sizeof(T);
+    // Call aclrtMalloc to allocate device memory
+    auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return 1);
+
+    // Call aclrtMemcpy to copy host data to device memory
+    ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return 1);
ERROR: %d\n", ret); return 1); + + // Call aclCreateTensor to create a aclTensor object + *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, nullptr, 0, aclFormat::ACL_FORMAT_ND, shape.data(), + shape.size(), *deviceAddr); + return 0; +} + +void DestroyResources(std::vector tensors, std::vector deviceAddrs, aclrtStream stream, + int32_t deviceId, void *workspaceAddr = nullptr) +{ + // Release aclTensor and device + for (uint32_t i = 0; i < tensors.size(); i++) { + if (tensors[i] != nullptr) { + aclDestroyTensor(reinterpret_cast(tensors[i])); + } + if (deviceAddrs[i] != nullptr) { + aclrtFree(deviceAddrs[i]); + } + } + if (workspaceAddr != nullptr) { + aclrtFree(workspaceAddr); + } + // Destroy stream and reset device + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); +} + +void InitializeData(std::vector &inputX, std::vector &inputY, std::vector &inputZ, + std::vector &golden, std::vector &shape) +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(1.0f, 100.0f); + + for (int i = 0; i < shape[0] * shape[1]; ++i) { + inputX[i] = dis(gen); + inputY[i] = dis(gen); + golden[i] = inputX[i] + inputY[i]; + inputZ[i] = 0.0; + } +} + +bool VerifyResults(const std::vector &goldenData, const std::vector &resultData) +{ + int64_t len = 10; + LOG_PRINT("result is:\n"); + for (int64_t i = 0; i < len; i++) { + LOG_PRINT("%.1f ", resultData[i]); + } + LOG_PRINT("\n"); + if (std::equal(resultData.begin(), resultData.end(), goldenData.begin())) { + LOG_PRINT("test pass\n"); + } else { + LOG_PRINT("test failed\n"); + return false; + } + return true; +} +} + +int main(int argc, char **argv) +{ + // 1. (Fixed code) Initialize device / stream, refer to the list of external interfaces of acl + // Update deviceId to your own device id + int32_t deviceId = 0; + aclrtStream stream; + auto ret = Init(deviceId, &stream); + CHECK_RET(ret == 0, LOG_PRINT("Init acl failed. ERROR: %d\n", ret); return 1); + + // 2. Create input and output, need to customize according to the interface of the API + std::vector shape = {8, 2048}; + void *inputXDeviceAddr = nullptr; + void *inputYDeviceAddr = nullptr; + void *outputZDeviceAddr = nullptr; + aclTensor *inputX = nullptr; + aclTensor *inputY = nullptr; + aclTensor *outputZ = nullptr; + std::vector inputXHostData(shape[0] * shape[1]); + std::vector inputYHostData(shape[0] * shape[1]); + std::vector outputZHostData(shape[0] * shape[1]); + std::vector goldenData(shape[0] * shape[1]); + + InitializeData(inputXHostData, inputYHostData, outputZHostData, goldenData, shape); + std::vector tensors = {inputX, inputY, outputZ}; + std::vector deviceAddrs = {inputXDeviceAddr, inputYDeviceAddr, outputZDeviceAddr}; + // Create inputX aclTensor + ret = CreateAclTensor(inputXHostData, shape, &inputXDeviceAddr, aclDataType::ACL_FLOAT, &inputX); + CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return 1); + // Create inputY aclTensor + ret = CreateAclTensor(inputYHostData, shape, &inputYDeviceAddr, aclDataType::ACL_FLOAT, &inputY); + CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return 1); + // Create outputZ aclTensor + ret = CreateAclTensor(outputZHostData, shape, &outputZDeviceAddr, aclDataType::ACL_FLOAT, &outputZ); + CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return 1); + + // 3. 
+    // 3. Call the API of the custom operator library
+    uint64_t workspaceSize = 0;
+    aclOpExecutor *executor;
+    // Calculate the workspace size and allocate memory for it
+    ret = aclnnAddCustomGetWorkspaceSize(inputX, inputY, outputZ, &workspaceSize, &executor);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnAddCustomGetWorkspaceSize failed. ERROR: %d\n", ret);
+              DestroyResources(tensors, deviceAddrs, stream, deviceId); return 1);
+
+    void *workspaceAddr = nullptr;
+    if (workspaceSize > 0U) {
+        ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+        CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+    }
+    // Execute the custom operator
+    ret = aclnnAddCustom(workspaceAddr, workspaceSize, executor, stream);
+    CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+
+    // 4. (Fixed code) Synchronize and wait for the task to complete
+    ret = aclrtSynchronizeStream(stream);
+    CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+
+    // 5. Get the output value, copy the result from device memory to host memory, need to modify according to the
+    // interface of the API
+    auto size = GetShapeSize(shape);
+    std::vector<float> resultData(size, 0);
+    ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]), outputZDeviceAddr,
+                      size * sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST);
+    CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+
+    // 6. Destroy resources, need to modify according to the interface of the API
+    DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr);
+
+    // print and verify the output result
+    if (!VerifyResults(goldenData, resultData)) {
+        return -1;
+    }
+    return 0;
+}
diff --git a/atvc/examples/aclnn/add/AclNNInvocationNaive/run.sh b/atvc/examples/aclnn/add/AclNNInvocationNaive/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7abad744e9245564546a2b97d8b1bba68b659dd5
--- /dev/null
+++ b/atvc/examples/aclnn/add/AclNNInvocationNaive/run.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+source $_ASCEND_INSTALL_PATH/bin/setenv.bash
+export DDK_PATH=$_ASCEND_INSTALL_PATH
+export NPU_HOST_LIB=$_ASCEND_INSTALL_PATH/$(arch)-$(uname -s | tr '[:upper:]' '[:lower:]')/devlib
+
+set -e
+rm -rf build
+mkdir -p build
+cmake -B build -DCMAKE_SKIP_RPATH=TRUE
+cmake --build build -j
+(
+    cd build
+    export LD_LIBRARY_PATH=$_ASCEND_INSTALL_PATH/opp/vendors/customize/op_api/lib:$LD_LIBRARY_PATH
+    ./execute_add_op
+)
diff --git a/atvc/examples/aclnn/add/AddCustom.json b/atvc/examples/aclnn/add/AddCustom.json
new file mode 100644
index 0000000000000000000000000000000000000000..f55a31b9bb3761f2b2a4bc7e8c152aac6e1511b6
--- /dev/null
+++ b/atvc/examples/aclnn/add/AddCustom.json
@@ -0,0 +1,46 @@
+[
+    {
+        "op": "AddCustom",
+        "language": "cpp",
+        "input_desc": [
+            {
+                "name": "x",
+                "param_type": "required",
+                "format": [
+                    "ND",
+                    "ND"
+                ],
+                "type": [
+                    "float32",
+                    "int32"
+                ]
+            },
+            {
+                "name": "y",
+                "param_type": "required",
+                "format": [
+                    "ND",
+                    "ND"
+                ],
"type": [ + "float32", + "int32" + ] + } + ], + "output_desc": [ + { + "name": "z", + "param_type": "required", + "format": [ + "ND", + "ND" + ], + "type": [ + "float32", + "int32" + ] + } + ] + } +] \ No newline at end of file diff --git a/atvc/examples/aclnn/add/AddCustom/CMakeLists.txt b/atvc/examples/aclnn/add/AddCustom/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7e1f43e5a2ed42c2ee3395308378e61eb15606f --- /dev/null +++ b/atvc/examples/aclnn/add/AddCustom/CMakeLists.txt @@ -0,0 +1,76 @@ +cmake_minimum_required(VERSION 3.16.0) +project(opp) + +include(cmake/config.cmake) +include(cmake/func.cmake) +include(cmake/intf.cmake) + +if(ENABLE_CROSS_COMPILE) + if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL x86_64) + set(CROSS_COMPILE_PLATFORM aarch64) + else() + set(CROSS_COMPILE_PLATFORM x86_64) + endif() + set(PLATFORM ${CMAKE_SYSTEM_PROCESSOR}) + set(CMAKE_COMPILE_COMPILER_LIBRARY ${ASCEND_CANN_PACKAGE_PATH}/${PLATFORM}-linux/devlib/linux/${CROSS_COMPILE_PLATFORM}/) + set(CMAKE_COMPILE_RUNTIME_LIBRARY ${ASCEND_CANN_PACKAGE_PATH}/${PLATFORM}-linux/devlib/${CROSS_COMPILE_PLATFORM}/) + if(CMAKE_CROSS_LIBRARY_PATH) + set(CMAKE_COMPILE_COMPILER_LIBRARY ${CMAKE_CROSS_LIBRARY_PATH}) + set(CMAKE_COMPILE_RUNTIME_LIBRARY ${CMAKE_CROSS_LIBRARY_PATH}) + endif() + set(CMAKE_SYSTEM_PROCESSOR ${CROSS_COMPILE_PLATFORM}) + set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER}) + set(CMAKE_CXX_COMPILER ${CMAKE_CROSS_PLATFORM_COMPILER}) +else() + set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER}) +endif() + +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework) + add_subdirectory(framework) +endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) + add_subdirectory(op_host) +endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) + add_subdirectory(op_kernel) +endif() +if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) +endif() + +# modify vendor_name in install.sh and upgrade.sh +add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh + COMMAND mkdir -p ${CMAKE_BINARY_DIR}/scripts + COMMAND cp -r ${CMAKE_SOURCE_DIR}/scripts/* ${CMAKE_BINARY_DIR}/scripts/ + COMMAND sed -i "s/vendor_name=customize/vendor_name=${vendor_name}/g" ${CMAKE_BINARY_DIR}/scripts/* +) +add_custom_target(modify_vendor ALL DEPENDS ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh) + +get_system_info(SYSTEM_INFO) + +# gen version.info +add_custom_target(gen_version_info ALL + COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/util/gen_version_info.sh ${ASCEND_CANN_PACKAGE_PATH} ${CMAKE_CURRENT_BINARY_DIR} +) + +if(NOT ASCEND_PACK_SHARED_LIBRARY) + install(DIRECTORY ${CMAKE_BINARY_DIR}/scripts/ DESTINATION . 
FILE_PERMISSIONS OWNER_EXECUTE OWNER_READ GROUP_READ) + + install(FILES ${CMAKE_SOURCE_DIR}/custom.proto DESTINATION packages OPTIONAL) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/version.info + DESTINATION packages/vendors/${vendor_name}/) + + # CPack config + set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME}) + set(CPACK_PACKAGE_VERSION ${CMAKE_PROJECT_VERSION}) + set(CPACK_PACKAGE_DESCRIPTION "CPack opp project") + set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CPack opp project") + set(CPACK_PACKAGE_DIRECTORY ${CMAKE_INSTALL_PREFIX}) + set(CPACK_PACKAGE_FILE_NAME "custom_opp_${SYSTEM_INFO}.run") + set(CPACK_GENERATOR External) + set(CPACK_CMAKE_GENERATOR "Unix Makefiles") + set(CPACK_EXTERNAL_ENABLE_STAGING TRUE) + set(CPACK_EXTERNAL_PACKAGE_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/makeself.cmake) + set(CPACK_EXTERNAL_BUILT_PACKAGES ${CPACK_PACKAGE_DIRECTORY}/_CPack_Packages/Linux/External/${CPACK_PACKAGE_FILE_NAME}/${CPACK_PACKAGE_FILE_NAME}) + include(CPack) +endif() diff --git a/atvc/examples/aclnn/add/AddCustom/cmake/func.cmake b/atvc/examples/aclnn/add/AddCustom/cmake/func.cmake new file mode 100644 index 0000000000000000000000000000000000000000..e234f442aedc54b40f2d557d5ead27d7e8d880f5 --- /dev/null +++ b/atvc/examples/aclnn/add/AddCustom/cmake/func.cmake @@ -0,0 +1,377 @@ +include(ExternalProject) +if(DEFINED ENV{ATVC_PATH}) + set(ATVC_PATH $ENV{ATVC_PATH}) +else() + set(ATVC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../include") +endif() +function(get_system_info SYSTEM_INFO) + if (UNIX) + execute_process(COMMAND grep -i ^id= /etc/os-release OUTPUT_VARIABLE TEMP) + string(REGEX REPLACE "\n|id=|ID=|\"" "" SYSTEM_NAME ${TEMP}) + set(${SYSTEM_INFO} ${SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR} PARENT_SCOPE) + elseif (WIN32) + message(STATUS "System is Windows. Only for pre-build.") + else () + message(FATAL_ERROR "${CMAKE_SYSTEM_NAME} not support.") + endif () +endfunction() + +function(opbuild) + message(STATUS "Opbuild generating sources") + cmake_parse_arguments(OPBUILD "" "OUT_DIR;PROJECT_NAME;ACCESS_PREFIX;ENABLE_SOURCE" "OPS_SRC" ${ARGN}) + execute_process(COMMAND ${CMAKE_COMPILE} -g -fPIC -shared -std=c++17 ${OPBUILD_OPS_SRC} -D_GLIBCXX_USE_CXX11_ABI=0 + -I ${ASCEND_CANN_PACKAGE_PATH}/include -I ${CMAKE_CURRENT_SOURCE_DIR}/../op_kernel + -L ${ASCEND_CANN_PACKAGE_PATH}/lib64 -lexe_graph -lregister -ltiling_api -I ${ATVC_PATH} + -o ${OPBUILD_OUT_DIR}/libascend_all_ops.so + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message("build ops lib info: ${EXEC_INFO}") + message("build ops lib error: ${EXEC_ERROR}") + message(FATAL_ERROR "opbuild run failed!") + endif() + set(proj_env "") + set(prefix_env "") + if (NOT "${OPBUILD_PROJECT_NAME}x" STREQUAL "x") + set(proj_env "OPS_PROJECT_NAME=${OPBUILD_PROJECT_NAME}") + endif() + if (NOT "${OPBUILD_ACCESS_PREFIX}x" STREQUAL "x") + set(prefix_env "OPS_DIRECT_ACCESS_PREFIX=${OPBUILD_ACCESS_PREFIX}") + endif() + + set(ENV{ENABLE_SOURCE_PACAKGE} ${OPBUILD_ENABLE_SOURCE}) + if(${ASCEND_PACK_SHARED_LIBRARY}) + if (NOT vendor_name) + message(FATAL_ERROR "ERROR: vendor_name is invalid!") + return() + endif() + set(ENV{ASCEND_VENDOR_NAME} ${vendor_name}) + set(ENV{OPS_PRODUCT_NAME} ${ASCEND_COMPUTE_UNIT}) + set(ENV{SYSTEM_PROCESSOR} ${CMAKE_SYSTEM_PROCESSOR}) + endif() + execute_process(COMMAND ${proj_env} ${prefix_env} ${ASCEND_CANN_PACKAGE_PATH}/toolkit/tools/opbuild/op_build + ${OPBUILD_OUT_DIR}/libascend_all_ops.so ${OPBUILD_OUT_DIR} + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + 
+                    ERROR_VARIABLE EXEC_ERROR
+    )
+    unset(ENV{ENABLE_SOURCE_PACAKGE})
+    if(${ASCEND_PACK_SHARED_LIBRARY})
+        unset(ENV{ASCEND_VENDOR_NAME})
+        unset(ENV{OPS_PRODUCT_NAME})
+        unset(ENV{SYSTEM_PROCESSOR})
+    endif()
+    if (${EXEC_RESULT})
+        message("opbuild ops info: ${EXEC_INFO}")
+        message("opbuild ops error: ${EXEC_ERROR}")
+    endif()
+    message(STATUS "Opbuild generating sources - done")
+endfunction()
+
+function(add_ops_info_target)
+    cmake_parse_arguments(OPINFO "" "TARGET;OPS_INFO;OUTPUT;INSTALL_DIR" "" ${ARGN})
+    get_filename_component(opinfo_file_path "${OPINFO_OUTPUT}" DIRECTORY)
+    add_custom_command(OUTPUT ${OPINFO_OUTPUT}
+        COMMAND mkdir -p ${opinfo_file_path}
+        COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/parse_ini_to_json.py
+                ${OPINFO_OPS_INFO} ${OPINFO_OUTPUT}
+    )
+    add_custom_target(${OPINFO_TARGET} ALL
+        DEPENDS ${OPINFO_OUTPUT}
+    )
+    if(NOT ${ASCEND_PACK_SHARED_LIBRARY})
+        install(FILES ${OPINFO_OUTPUT}
+                DESTINATION ${OPINFO_INSTALL_DIR}
+        )
+    endif()
+endfunction()
+
+function(add_ops_compile_options OP_TYPE)
+    cmake_parse_arguments(OP_COMPILE "" "OP_TYPE" "COMPUTE_UNIT;OPTIONS" ${ARGN})
+    execute_process(COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_gen_options.py
+                            ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS} ${OP_TYPE} ${OP_COMPILE_COMPUTE_UNIT}
+                            ${OP_COMPILE_OPTIONS}
+                    RESULT_VARIABLE EXEC_RESULT
+                    OUTPUT_VARIABLE EXEC_INFO
+                    ERROR_VARIABLE EXEC_ERROR)
+    if (${EXEC_RESULT})
+        message("add ops compile options info: ${EXEC_INFO}")
+        message("add ops compile options error: ${EXEC_ERROR}")
+        message(FATAL_ERROR "add ops compile options failed!")
+    endif()
+endfunction()
+
+function(add_npu_support_target)
+    cmake_parse_arguments(NPUSUP "" "TARGET;OPS_INFO_DIR;OUT_DIR;INSTALL_DIR" "" ${ARGN})
+    get_filename_component(npu_sup_file_path "${NPUSUP_OUT_DIR}" DIRECTORY)
+    add_custom_command(OUTPUT ${NPUSUP_OUT_DIR}/npu_supported_ops.json
+        COMMAND mkdir -p ${NPUSUP_OUT_DIR}
+        COMMAND ${CMAKE_SOURCE_DIR}/cmake/util/gen_ops_filter.sh
+                ${NPUSUP_OPS_INFO_DIR}
+                ${NPUSUP_OUT_DIR}
+    )
+    add_custom_target(npu_supported_ops ALL
+        DEPENDS ${NPUSUP_OUT_DIR}/npu_supported_ops.json
+    )
+    if(NOT ${ASCEND_PACK_SHARED_LIBRARY})
+        install(FILES ${NPUSUP_OUT_DIR}/npu_supported_ops.json
+                DESTINATION ${NPUSUP_INSTALL_DIR}
+        )
+    endif()
+endfunction()
+
+function(add_simple_kernel_compile)
+    set(options "")
+    set(single_value_args "OPS_INFO;OUT_DIR;TILING_LIB;OP_TYPE;SRC;COMPUTE_UNIT;JSON_FILE;DYNAMIC_PATH")
+    set(multi_value_args "OPTIONS;CONFIGS")
+    cmake_parse_arguments(BINCMP "${options}" "${single_value_args}" "${multi_value_args}" ${ARGN})
+    if (NOT DEFINED BINCMP_OUT_DIR)
+        set(BINCMP_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/binary)
+    endif()
+    if (NOT DEFINED BINCMP_TILING_LIB)
+        set(BINCMP_TILING_LIB $<TARGET_FILE:cust_optiling>)
+    endif()
+    if (${ASCEND_PACK_SHARED_LIBRARY})
+        if (NOT TARGET op_kernel_pack)
+            add_custom_target(op_kernel_pack
+                COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_pack_kernel.py
+                        --input-path=${BINCMP_OUT_DIR}
+                        --output-path=${BINCMP_OUT_DIR}/library
+                        --enable-library=${ASCEND_PACK_SHARED_LIBRARY}
+                        --platform=${CMAKE_SYSTEM_PROCESSOR})
+            add_library(ascend_kernels INTERFACE)
+            target_link_libraries(ascend_kernels INTERFACE kernels)
+            target_link_directories(ascend_kernels INTERFACE ${BINCMP_OUT_DIR}/library)
+            target_include_directories(ascend_kernels INTERFACE ${BINCMP_OUT_DIR}/library)
+            add_dependencies(ascend_kernels op_kernel_pack)
+            add_dependencies(op_kernel_pack ${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT})
endif() + endif() + # add Environment Variable Configurations of ccache + set(_ASCENDC_ENV_VAR) + if(${CMAKE_CXX_COMPILER_LAUNCHER} MATCHES "ccache$") + list(APPEND _ASCENDC_ENV_VAR export ASCENDC_CCACHE_EXECUTABLE=${CMAKE_CXX_COMPILER_LAUNCHER} &&) + endif() + + if (NOT DEFINED BINCMP_OPS_INFO) + set(BINCMP_OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${BINCMP_COMPUTE_UNIT}-ops-info.ini) + endif() + if (NOT ${ENABLE_CROSS_COMPILE}) + add_custom_target(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} + COMMAND ${_ASCENDC_ENV_VAR} ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_compile_kernel.py + --op-name=${BINCMP_OP_TYPE} + --src-file=${BINCMP_SRC} + --compute-unit=${BINCMP_COMPUTE_UNIT} + --compile-options=\"${BINCMP_OPTIONS}\" + --debug-config=\"${BINCMP_CONFIGS}\" + --config-ini=${BINCMP_OPS_INFO} + --tiling-lib=${BINCMP_TILING_LIB} + --output-path=${BINCMP_OUT_DIR} + --dynamic-dir=${BINCMP_DYNAMIC_PATH} + --enable-binary=\"${ENABLE_BINARY_PACKAGE}\" + --json-file=${BINCMP_JSON_FILE} + --build-tool=$(MAKE)) + add_dependencies(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} cust_optiling) + else() + if (${ENABLE_BINARY_PACKAGE} AND NOT DEFINED HOST_NATIVE_TILING_LIB) + message(FATAL_ERROR "Native host libs was not set for cross compile!") + endif() + add_custom_target(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} + COMMAND ${_ASCENDC_ENV_VAR} ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_compile_kernel.py + --op-name=${BINCMP_OP_TYPE} + --src-file=${BINCMP_SRC} + --compute-unit=${BINCMP_COMPUTE_UNIT} + --compile-options=\"${BINCMP_OPTIONS}\" + --debug-config=\"${BINCMP_CONFIGS}\" + --config-ini=${BINCMP_OPS_INFO} + --tiling-lib=${HOST_NATIVE_TILING_LIB} + --output-path=${BINCMP_OUT_DIR} + --dynamic-dir=${BINCMP_DYNAMIC_PATH} + --enable-binary=\"${ENABLE_BINARY_PACKAGE}\" + --json-file=${BINCMP_JSON_FILE} + --build-tool=$(MAKE)) + endif() + add_dependencies(ascendc_bin_${BINCMP_COMPUTE_UNIT}_gen_ops_config ${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT}) + add_dependencies(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} ops_info_gen_${BINCMP_COMPUTE_UNIT}) +endfunction() + +function(ascendc_device_library) + message(STATUS "Ascendc device library generating") + cmake_parse_arguments(DEVICE "" "TARGET;OPTION" "SRC" ${ARGN}) + execute_process( + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink + COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink/CMakeLists.txt + ) + execute_process( + COMMAND ${CMAKE_COMMAND} -E echo "cmake_minimum_required(VERSION 3.16.0)\nproject(cust_tiling_sink)\ninclude(${CMAKE_SOURCE_DIR}/cmake/device_task.cmake)\n" + OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink/CMakeLists.txt + RESULT_VARIABLE result + ) + string(REPLACE ";" " " DEVICE_SRC "${DEVICE_SRC}") + ExternalProject_Add(tiling_sink_task + SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink + CONFIGURE_COMMAND ${CMAKE_COMMAND} + -DASCEND_CANN_PACKAGE_PATH=${ASCEND_CANN_PACKAGE_PATH} + -DTARGET=${DEVICE_TARGET} + -DOPTION=${DEVICE_OPTION} + -DSRC=${DEVICE_SRC} + -DVENDOR_NAME=${vendor_name} + + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} + INSTALL_COMMAND "" + BUILD_ALWAYS TRUE + ) + ExternalProject_Get_Property(tiling_sink_task BINARY_DIR) + set(TILINGSINK_LIB_PATH "") + if ("${DEVICE_OPTION}" STREQUAL "SHARED") + set(TILINGSINK_LIB_PATH "${BINARY_DIR}/libcust_opmaster.so") + else() + set(TILINGSINK_LIB_PATH "${BINARY_DIR}/libcust_opmaster.a") + endif() + install(FILES ${TILINGSINK_LIB_PATH} + DESTINATION 
packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_master_device/lib + ) +endfunction() +function(add_opregistry_target) + string(REPLACE ";" "-" COMPUTE_UNIT "${ASCEND_COMPUTE_UNIT}") + add_custom_target(op_registry_pack + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_pack_opregistry.py + --input-path=${CMAKE_SOURCE_DIR}/build_out/ + --base-path=${CMAKE_SOURCE_DIR}/build_out/tmp/vendors/ + --output-path=${CMAKE_SOURCE_DIR}/build_out/library/ + --vendor-name=${vendor_name} + --compute-unit=${COMPUTE_UNIT} + --framework-type=${ASCEND_FRAMEWORK_TYPE} + --platform=${CMAKE_SYSTEM_PROCESSOR}) + add_library(ascend_opregistry INTERFACE) + target_link_libraries(ascend_opregistry INTERFACE opregistry) + target_link_directories(ascend_opregistry INTERFACE ${CMAKE_SOURCE_DIR}/build_out/library) + target_include_directories(ascend_opregistry INTERFACE ${CMAKE_SOURCE_DIR}/build_out/library) + add_dependencies(ascend_opregistry op_registry_pack) + if(EXISTS "${CMAKE_SOURCE_DIR}/framework/caffe_plugin") + add_dependencies(op_registry_pack cust_caffe_parsers) + elseif(EXISTS "${CMAKE_SOURCE_DIR}/framework/tf_plugin") + add_dependencies(op_registry_pack cust_tf_parsers) + elseif(EXISTS "${CMAKE_SOURCE_DIR}/framework/onnx_plugin") + add_dependencies(op_registry_pack cust_onnx_parsers) + endif() +endfunction() + +function(add_kernels_install) + # install kernel file + if (${ENABLE_SOURCE_PACKAGE}) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/binary/dynamic/ + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic/ + ) + endif() + + # install *.o files and *.json files + if (${ENABLE_BINARY_PACKAGE}) + set(INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/) + foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}/ + DESTINATION ${INSTALL_DIR}/kernel/${compute_unit}/ + ) + endforeach() + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/binary/config/ + DESTINATION ${INSTALL_DIR}/kernel/config/ + ) + endif() +endfunction() + +function(add_kernels_compile) + set(DYNAMIC_PATH "") + if (${ENABLE_SOURCE_PACKAGE}) + set(DYNAMIC_PATH ${CMAKE_CURRENT_BINARY_DIR}/binary/dynamic) + execute_process(COMMAND sh -c "mkdir -p ${DYNAMIC_PATH} && + cp -rf ${CMAKE_SOURCE_DIR}/op_kernel/* ${DYNAMIC_PATH}/ && + rm ${DYNAMIC_PATH}/CMakeLists.txt" + RESULT_VARIABLE EXEC_RESULT + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message(FATAL_ERROR, "copy_source_files failed, gen error:${EXEC_ERROR}" ) + endif() + endif() + + foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + # generate aic-${compute_unit}-ops-info.json + add_ops_info_target(TARGET ops_info_gen_${compute_unit} + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit} + ) + + # define a target:binary to prevent kernel file from being rebuilt during the preinstall process + if (NOT TARGET binary) + add_custom_target(binary) + endif() + + if (${ENABLE_BINARY_PACKAGE} OR ${ENABLE_SOURCE_PACKAGE}) + if (${ENABLE_BINARY_PACKAGE}) + # gen binary_info_config.json and .json + add_custom_target(ascendc_bin_${compute_unit}_gen_ops_config + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/insert_simplified_keys.py + -p ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + COMMAND ${ASCEND_PYTHON_EXECUTABLE} 
+                        ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_ops_config.py
+                        -p ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}
+                        -s ${compute_unit}
+                COMMAND ${CMAKE_COMMAND} -E make_directory
+                        ${CMAKE_CURRENT_BINARY_DIR}/binary/config/${compute_unit}
+                COMMAND mv ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}/*.json
+                        ${CMAKE_CURRENT_BINARY_DIR}/binary/config/${compute_unit}
+            )
+        else()
+            if (NOT TARGET ascendc_bin_${compute_unit}_gen_ops_config)
+                add_custom_target(ascendc_bin_${compute_unit}_gen_ops_config)
+            endif()
+        endif()
+        add_dependencies(binary ascendc_bin_${compute_unit}_gen_ops_config)
+
+        # get op_type-op_name from aic-${compute_unit}-ops-info.ini
+        execute_process(COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_get_op_name.py
+                        --ini-file=${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
+                        OUTPUT_VARIABLE OP_TYPE_NAME
+                        RESULT_VARIABLE EXEC_RESULT
+                        ERROR_VARIABLE EXEC_ERROR
+        )
+        if (${EXEC_RESULT})
+            message(FATAL_ERROR, "get op name failed, gen error: ${EXEC_ERROR}")
+        endif()
+
+        # compile op one by one with ascendc_compile_kernel.py
+        string(REPLACE "\n" ";" TYPE_NAME_LIST "${OP_TYPE_NAME}")
+        foreach(TYPE_NAME IN LISTS TYPE_NAME_LIST)
+            if (NOT "${TYPE_NAME}" STREQUAL "")
+                string(REPLACE "-" ";" bin_sep ${TYPE_NAME})
+                list(GET bin_sep 0 op_type)
+                list(GET bin_sep 1 op_file)
+                add_simple_kernel_compile(OP_TYPE ${op_type}
+                                          SRC ${CMAKE_SOURCE_DIR}/op_kernel/${op_file}.cpp
+                                          COMPUTE_UNIT ${compute_unit}
+                                          JSON_FILE ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json
+                                          DYNAMIC_PATH ${DYNAMIC_PATH})
+            endif()
+        endforeach()
+        endif()
+    endforeach()
+
+    # generate npu_supported_ops.json
+    add_npu_support_target(TARGET npu_supported_ops
+                           OPS_INFO_DIR ${ASCEND_AUTOGEN_PATH}
+                           OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core
+                           INSTALL_DIR packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE}
+    )
+
+    if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases)
+        add_subdirectory(testcases)
+    endif()
+
+    if(NOT ASCEND_PACK_SHARED_LIBRARY)
+        add_kernels_install()
+    else()
+        add_opregistry_target()
+    endif()
+endfunction()
\ No newline at end of file
diff --git a/atvc/examples/aclnn/add/AddCustom/op_host/CMakeLists.txt b/atvc/examples/aclnn/add/AddCustom/op_host/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7a6279ff469e98190072e341d2058941f197ccbf
--- /dev/null
+++ b/atvc/examples/aclnn/add/AddCustom/op_host/CMakeLists.txt
@@ -0,0 +1,183 @@
+if(DEFINED ENV{ATVC_PATH})
+    set(ATVC_PATH $ENV{ATVC_PATH})
+else()
+    set(ATVC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../include")
+endif()
+
+aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ops_srcs)
+opbuild(OPS_SRC ${ops_srcs}
+        OUT_DIR ${ASCEND_AUTOGEN_PATH}
+)
+
+file(GLOB group_proto_src ${ASCEND_AUTOGEN_PATH}/group_proto/*.cc)
+
+add_library(cust_op_proto SHARED
+    $<$<BOOL:${group_proto_src}>:${group_proto_src}>
+    ${ops_srcs}
+    ${ASCEND_AUTOGEN_PATH}/op_proto.cc
+)
+target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB)
+target_compile_options(cust_op_proto PRIVATE
+    -fvisibility=hidden
+)
+if(ENABLE_CROSS_COMPILE)
+    target_link_directories(cust_op_proto PRIVATE
+        ${CMAKE_COMPILE_COMPILER_LIBRARY}
+        ${CMAKE_COMPILE_RUNTIME_LIBRARY}
+    )
+endif()
+target_include_directories(cust_op_proto PRIVATE ${ATVC_PATH})
+target_link_libraries(cust_op_proto PRIVATE
+    intf_pub
+    exe_graph
+    register
+    tiling_api
+    -Wl,--whole-archive
+    rt2_registry
+    -Wl,--no-whole-archive
+)
+set_target_properties(cust_op_proto PROPERTIES
+    OUTPUT_NAME
+    cust_opsproto_rt2.0
+)
+file(GLOB fallback_src ${ASCEND_AUTOGEN_PATH}/fallback_*.cpp)
+add_library(cust_optiling SHARED ${ops_srcs})
+if (${fallback_src})
+    target_sources(cust_optiling PRIVATE ${fallback_src})
+endif()
+target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB)
+target_compile_options(cust_optiling PRIVATE
+    -fvisibility=hidden
+)
+target_include_directories(cust_optiling PRIVATE
+    ${ATVC_PATH}
+)
+if(ENABLE_CROSS_COMPILE)
+    target_link_directories(cust_optiling PRIVATE
+        ${CMAKE_COMPILE_COMPILER_LIBRARY}
+        ${CMAKE_COMPILE_RUNTIME_LIBRARY}
+    )
+endif()
+target_link_libraries(cust_optiling PRIVATE
+    nnopbase
+    intf_pub
+    exe_graph
+    register
+    tiling_api
+    -Wl,--whole-archive
+    rt2_registry
+    -Wl,--no-whole-archive
+)
+set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME
+    cust_opmaster_rt2.0
+)
+
+file(GLOB aclnn_src ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp)
+file(GLOB aclnn_inc ${ASCEND_AUTOGEN_PATH}/aclnn_*.h)
+
+if(NOT ASCEND_PACK_SHARED_LIBRARY)
+    add_library(cust_opapi SHARED ${aclnn_src})
+else()
+    file(GLOB op_registry ${ASCEND_AUTOGEN_PATH}/custom_op_registry.cpp)
+    add_library(cust_opapi SHARED ${aclnn_src} ${op_registry})
+    target_compile_definitions(cust_opapi PRIVATE ACLNN_WITH_BINARY)
+endif()
+
+target_include_directories(cust_opapi PRIVATE
+    ${ATVC_PATH}
+)
+if(ENABLE_CROSS_COMPILE)
+    target_link_directories(cust_opapi PRIVATE
+        ${CMAKE_COMPILE_COMPILER_LIBRARY}
+        ${CMAKE_COMPILE_RUNTIME_LIBRARY}
+    )
+endif()
+if(NOT ASCEND_PACK_SHARED_LIBRARY)
+    target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase)
+else()
+    add_library(cust_op_proto_obj OBJECT
+        $<$<BOOL:${group_proto_src}>:${group_proto_src}>
+        ${ops_srcs}
+        ${ASCEND_AUTOGEN_PATH}/op_proto.cc
+    )
+    target_compile_definitions(cust_op_proto_obj PRIVATE OP_PROTO_LIB)
+    target_compile_options(cust_op_proto_obj PRIVATE
+        -fvisibility=hidden
+    )
+    if(ENABLE_CROSS_COMPILE)
+        target_link_directories(cust_op_proto_obj PRIVATE
+            ${CMAKE_COMPILE_COMPILER_LIBRARY}
+            ${CMAKE_COMPILE_RUNTIME_LIBRARY}
+        )
+    endif()
+    target_link_libraries(cust_op_proto_obj PRIVATE
+        intf_pub
+        exe_graph
+        register
+        tiling_api
+        -Wl,--whole-archive
+        rt2_registry
+        -Wl,--no-whole-archive
+    )
+    add_library(cust_optiling_obj OBJECT ${ops_srcs})
+    target_compile_definitions(cust_optiling_obj PRIVATE OP_TILING_LIB)
+    target_compile_options(cust_optiling_obj PRIVATE
+        -fvisibility=hidden
+    )
+    target_include_directories(cust_optiling_obj PRIVATE ${ATVC_PATH})
+    if(ENABLE_CROSS_COMPILE)
+        target_link_directories(cust_optiling_obj PRIVATE
+            ${CMAKE_COMPILE_COMPILER_LIBRARY}
+            ${CMAKE_COMPILE_RUNTIME_LIBRARY}
+        )
+    endif()
+
+    target_link_libraries(cust_optiling_obj PRIVATE
+        intf_pub
+        exe_graph
+        register
+        tiling_api
+        -Wl,--whole-archive
+        rt2_registry
+        -Wl,--no-whole-archive
+    )
+    target_compile_options(cust_opapi PRIVATE -DLOG_CPP)
+    target_include_directories(cust_opapi INTERFACE ${ATVC_PATH})
+    target_include_directories(cust_opapi INTERFACE ${CMAKE_SOURCE_DIR}/build_out/binary/)
+    target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase cust_optiling_obj cust_op_proto_obj ascend_opregistry ascend_kernels)
+    add_dependencies(cust_opapi ascend_opregistry)
+endif()
+
+add_custom_target(optiling_compat ALL
+    COMMAND ln -sf lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$<TARGET_FILE_NAME:cust_optiling>
+            ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so
+)
+
+if(NOT ASCEND_PACK_SHARED_LIBRARY)
+    install(TARGETS cust_op_proto
+            LIBRARY DESTINATION packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR})
+    install(FILES
${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) + file(GLOB GROUP_PROTO_HEADERS ${ASCEND_AUTOGEN_PATH}/group_proto/*.h) + if (GROUP_PROTO_HEADERS) + install(FILES ${GROUP_PROTO_HEADERS} + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) + endif() + install(TARGETS cust_optiling + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) + install(TARGETS cust_opapi + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_api/lib) + install(FILES ${aclnn_inc} + DESTINATION packages/vendors/${vendor_name}/op_api/include) +else() + file(GLOB group_inc ${ASCEND_AUTOGEN_PATH}/group_proto/*.h) + install(TARGETS cust_opapi + LIBRARY DESTINATION op_api/lib) + install(FILES ${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION op_api/include) + install(FILES ${group_inc} + DESTINATION op_api/include) + install(FILES ${aclnn_inc} + DESTINATION op_api/include) +endif() diff --git a/atvc/examples/aclnn/add/AddCustom/op_host/add_custom.cpp b/atvc/examples/aclnn/add/AddCustom/op_host/add_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..da7c87ba15964e262435e3a7601d10819224e191 --- /dev/null +++ b/atvc/examples/aclnn/add/AddCustom/op_host/add_custom.cpp @@ -0,0 +1,81 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */
+
+#include "elewise/elewise_host.h"
+#include "register/op_def_registry.h"
+
+using AddOpTraitsFloat = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>>;
+using AddOpTraitsInt = ATVC::OpTraits<ATVC::OpInputs<int32_t, int32_t>, ATVC::OpOutputs<int32_t>>;
+
+namespace optiling {
+static ge::graphStatus TilingFunc(gert::TilingContext *context)
+{
+    // Declare the runtime tiling parameter
+    ATVC::EleWiseParam *tiling = context->GetTilingData<ATVC::EleWiseParam>();
+    uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize();
+    if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_FLOAT) {
+        // AddOpTraitsFloat is the Add operator description; the tiling data is computed from the operator's input/output count and the actual element count
+        (void)ATVC::Host::CalcEleWiseTiling<AddOpTraitsFloat>(totalLength, *tiling);
+    } else if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_INT32) {
+        (void)ATVC::Host::CalcEleWiseTiling<AddOpTraitsInt>(totalLength, *tiling);
+    }
+    // Set the tiling key
+    context->SetTilingKey(0);
+    // Set blockDim
+    context->SetBlockDim(tiling->tilingData.blockNum);
+    // Set the workspace size
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
+    currentWorkspace[0] = 0;
+    return ge::GRAPH_SUCCESS;
+}
+} // namespace optiling
+
+namespace ge {
+static graphStatus InferShape(gert::InferShapeContext *context)
+{
+    const gert::Shape *x1_shape = context->GetInputShape(0);
+    gert::Shape *y_shape = context->GetOutputShape(0);
+    *y_shape = *x1_shape;
+    return GRAPH_SUCCESS;
+}
+
+static graphStatus InferDataType(gert::InferDataTypeContext *context)
+{
+    const auto inputDataType = context->GetInputDataType(0);
+    context->SetOutputDataType(0, inputDataType);
+    return ge::GRAPH_SUCCESS;
+}
+} // namespace ge
+
+namespace ops {
+class AddCustom : public OpDef {
+public:
+    explicit AddCustom(const char *name) : OpDef(name)
+    {
+        this->Input("x")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Input("y")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Output("z")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND});
+
+        this->SetInferShape(ge::InferShape).SetInferDataType(ge::InferDataType);
+        this->AICore().SetTiling(optiling::TilingFunc).AddConfig("ascend910b");
+    }
+};
+OP_ADD(AddCustom);
+} // namespace ops
diff --git a/atvc/examples/aclnn/add/AddCustom/op_kernel/CMakeLists.txt b/atvc/examples/aclnn/add/AddCustom/op_kernel/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..16f4f8e4aa572b35d908cb42f2c4ffbdb91b2c5d
--- /dev/null
+++ b/atvc/examples/aclnn/add/AddCustom/op_kernel/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(CMAKE_VERSION_MAKEFILE ON)
+if(DEFINED ENV{ATVC_PATH})
+    set(ATVC_PATH $ENV{ATVC_PATH})
+else()
+    set(ATVC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../include")
+endif()
+if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx")
+    add_ops_compile_options(ALL OPTIONS -g -O0)
+endif()
+
+add_ops_compile_options(ALL OPTIONS -g -O0 --cce-aicore-block-local-init -I ${ATVC_PATH})
+add_kernels_compile()
\ No newline at end of file
diff --git a/atvc/examples/aclnn/add/AddCustom/op_kernel/add_custom.cpp b/atvc/examples/aclnn/add/AddCustom/op_kernel/add_custom.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d28d2c6c93b669480b2f0570b9301cd57a9105fa
--- /dev/null
+++ b/atvc/examples/aclnn/add/AddCustom/op_kernel/add_custom.cpp
@@ -0,0 +1,31 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include "elewise/elewise_device.h"
+
+using AddOpTraits = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>>;
+
+template <typename OpTraits>
+struct AddComputeFunc {
+    template <typename DataType>
+    __aicore__ inline void operator()(AscendC::LocalTensor<DataType> x, AscendC::LocalTensor<DataType> y, AscendC::LocalTensor<DataType> z)
+    {
+        AscendC::Add(z, x, y, z.GetSize());
+    }
+};
+
+extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling)
+{
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);
+    REGISTER_TILING_DEFAULT(ATVC::EleWiseParam);
+    auto op = ATVC::Kernel::EleWiseOpTemplate<AddComputeFunc<AddOpTraits>>();
+    op.Run(x, y, z, tiling);
+}
diff --git a/atvc/examples/aclnn/add/README.md b/atvc/examples/aclnn/add/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5eb359d02f3ee5c9e50a6c5f3a582b3f880af058
--- /dev/null
+++ b/atvc/examples/aclnn/add/README.md
@@ -0,0 +1,109 @@
+## 概述
+本样例基于AddCustom算子工程,介绍了基于ATVC的单算子工程开发与单算子调用。
+
+## 目录结构介绍
+```
+├── add
+│   ├── AclNNInvocationNaive    // 通过aclnn调用的方式调用AddCustom算子
+│   ├── AddCustom               // AddCustom算子工程
+│   ├── AddCustom.json          // AddCustom算子的原型定义json文件
+│   └── install.sh              // 脚本,调用msOpGen生成自定义算子工程,并编译
+```
+
+## 算子描述
+Add算子实现了两个数据相加,返回相加结果的功能。对应的数学表达式为:
+```
+z = x + y
+```
+## 算子规格描述
+| 算子类型(OpType) | Add | | | |
+| ---- | ---- | ---- | ---- | ---- |
+| 算子输入 | name | shape | data type | format |
+| | x | 8 * 2048 | float,int | ND |
+| | y | 8 * 2048 | float,int | ND |
+| 算子输出 | z | 8 * 2048 | float,int | ND |
+| 核函数名 | add_custom | | | |
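+
+算子部署完成后,可通过编译部署阶段自动生成的aclnn两段式接口调用AddCustom(完整流程见AclNNInvocationNaive)。下面给出一段调用示意代码(仅为示意:头文件名与接口名按aclnn<OpType>生成规则推测,实际以部署后的生成件为准):
+```cpp
+#include "acl/acl.h"
+#include "aclnn_add_custom.h" // 假设:自定义算子包部署后生成的头文件
+
+// 两段式接口调用示意:先查询workspace大小,再下发计算
+aclnnStatus CallAddCustom(aclTensor *x, aclTensor *y, aclTensor *z, aclrtStream stream)
+{
+    uint64_t workspaceSize = 0;
+    aclOpExecutor *executor = nullptr;
+    // 第一段接口:计算本次调用所需的workspace内存大小(接口名按生成规则推测)
+    aclnnStatus ret = aclnnAddCustomGetWorkspaceSize(x, y, z, &workspaceSize, &executor);
+    if (ret != ACL_SUCCESS) {
+        return ret;
+    }
+    void *workspace = nullptr;
+    if (workspaceSize > 0U) {
+        (void)aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+    }
+    // 第二段接口:执行算子计算任务
+    ret = aclnnAddCustom(workspace, workspaceSize, executor, stream);
+    (void)aclrtSynchronizeStream(stream);
+    if (workspace != nullptr) {
+        (void)aclrtFree(workspace);
+    }
+    return ret;
+}
+```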
+
+## 支持的产品型号
+本样例支持如下产品型号:
+- Atlas A2训练系列产品
+
+## 算子工程介绍
+算子工程目录AddCustom包含算子的实现文件,如下所示:
+```
+├── AddCustom               // AddCustom自定义算子工程
+│   ├── op_host             // host侧实现文件
+│   ├── op_kernel           // kernel侧实现文件
+│   ├── build.sh            // 算子构建入口
+│   └── CMakeLists.txt      // 算子的cmake文件
+```
+CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通过AddCustom.json自动创建,自定义算子工程具体请参考[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)>工程化算子开发>创建算子工程 章节。
+
+创建完自定义算子工程后,开发者重点需要完成算子host和kernel文件的功能开发。为简化样例运行流程,本样例已在AddCustom目录中准备好了必要的算子实现,install.sh脚本会创建一个CustomOp目录,并将算子实现文件复制到对应目录下,再编译算子。
+
+备注:CustomOp目录为生成目录,每次执行install.sh脚本都会删除该目录并重新生成,切勿在该目录下编写算子代码,否则会存在丢失风险。
+
+## 编译运行样例算子
+针对自定义算子工程,编译运行包含如下步骤:
+- 调用msOpGen工具生成自定义算子工程;
+- 基于ATVC框架完成算子host和kernel实现;
+- 编译自定义算子工程生成自定义算子包;
+- 安装自定义算子包到自定义算子库中;
+- 调用执行自定义算子。
+
+详细操作如下所示。
+### 1. 获取源码包及环境配置
+编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)完成源码包获取及环境变量准备。
+
+### 2. 生成自定义算子工程,复制host和kernel实现并编译算子
+ - 切换到msOpGen脚本install.sh所在目录
+    ```bash
+    # 若开发者以git命令行方式clone了master分支代码,并切换目录
+    cd atvc/examples/aclnn/add
+    ```
+
+ - 调用脚本,生成自定义算子工程,复制host和kernel实现并编译算子
+
+   运行install.sh脚本
+    ```bash
+    # 以910B1为例,运行命令如下:
+    bash install.sh -v [SOC_VERSION]
+    ```
+    参数说明:
+    - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号:
+    - Atlas A2训练系列产品
+
+   脚本运行成功后,会在当前目录下创建CustomOp目录;编译完成后,会在CustomOp/build_out中生成自定义算子安装包custom_opp_<target os>_<target architecture>.run,例如“custom_opp_ubuntu_x86_64.run”。
+
+### 3. 部署自定义算子包
+- 部署自定义算子包前,请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH
+    ```bash
+    echo $ASCEND_OPP_PATH
+    # 输出示例 /usr/local/Ascend/ascend-toolkit/latest/opp
+
+    # 若没有,则需导出CANN环境变量
+    source [ASCEND_INSTALL_PATH]/bin/setenv.bash
+    # 例如 source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash
+    ```
+    参数说明:
+    - ASCEND_INSTALL_PATH:CANN软件包安装路径,一般和上一步中指定的路径保持一致
+
+- 在自定义算子安装包所在路径下,执行如下命令安装自定义算子包
+    ```bash
+    cd CustomOp/build_out
+    ./custom_opp_<target os>_<target architecture>.run
+    ```
+    命令执行成功后,自定义算子包中的相关文件将部署至opp算子库环境变量ASCEND_OPP_PATH指向的vendors/customize目录中。
+
+### 4. 调用执行算子工程
+- [aclnn调用AddCustom算子工程](./AclNNInvocationNaive/README.md)
+
+## 更新说明
+| 时间 | 更新事项 |
+| ---------- | ---------------------------- |
+| 2025/07/24 | 新增readme |
diff --git a/atvc/examples/aclnn/add/install.sh b/atvc/examples/aclnn/add/install.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5ca57e7ebfea0ed405ccffa131a6c42ea94a5fe2
--- /dev/null
+++ b/atvc/examples/aclnn/add/install.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+SHORT=v:,i:,
+LONG=soc-version:,install-path:,
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS"
+
+while :; do
+    case "$1" in
+    -v | --soc-version)
+        SOC_VERSION="$2"
+        shift 2
+        ;;
+    -i | --install-path)
+        ASCEND_INSTALL_PATH="$2"
+        shift 2
+        ;;
+    --)
+        shift
+        break
+        ;;
+    *)
+        echo "[ERROR] Unexpected option: $1"
+        break
+        ;;
+    esac
+done
+
+VERSION_LIST="Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4"
+if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then
+    echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]"
+    exit -1
+fi
+
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+source $_ASCEND_INSTALL_PATH/bin/setenv.bash
+export ASCEND_HOME_PATH=$_ASCEND_INSTALL_PATH
+
+OP_NAME=AddCustom
+rm -rf CustomOp
+# Generate the op framework
+msopgen gen -i $OP_NAME.json -c ai_core-${SOC_VERSION} -lan cpp -out CustomOp
+# Copy op implementation files to CustomOp
+cp -rf $OP_NAME/* CustomOp
+# Delete tiling.h
+rm -rf CustomOp/op_host/*_tiling.h
+# Build CustomOp project
+(cd CustomOp && bash build.sh)
\ No newline at end of file
diff --git a/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/CMakeLists.txt b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9a92ffe7d9af319c4ebfb69925293080cc9ac2a1
--- /dev/null
+++ b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/CMakeLists.txt
@@ -0,0 +1,60 @@
+# Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+
+# CMake lowest version requirement
+cmake_minimum_required(VERSION 3.5.1)
+
+# project information
+project(acl_execute_reduce_sum)
+
+# Compile options
+add_compile_options(-std=c++11)
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "./")
+
+set(INC_PATH $ENV{DDK_PATH})
+
+if (NOT DEFINED ENV{DDK_PATH})
+    set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest")
+    message(STATUS "set default INC_PATH: ${INC_PATH}")
+else ()
+    message(STATUS "env INC_PATH: ${INC_PATH}")
+endif()
+
+set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/customize/op_api")
+
+set(LIB_PATH $ENV{NPU_HOST_LIB})
+
+# Dynamic libraries in the stub directory can only be used for compilation
+if (NOT DEFINED ENV{NPU_HOST_LIB})
+    string(TOLOWER "${CMAKE_SYSTEM_NAME}" SYSTEM_NAME_LOWER)
+    set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/${CMAKE_SYSTEM_PROCESSOR}-${SYSTEM_NAME_LOWER}/devlib")
+    message(STATUS "set default LIB_PATH: ${LIB_PATH}")
+else ()
+    message(STATUS "env LIB_PATH: ${LIB_PATH}")
+endif()
+
+# Header path
+include_directories(
+    ${INC_PATH}/include
+    ${CUST_PKG_PATH}/include
+)
+
+# add host lib path
+link_directories(
+    ${LIB_PATH}
+    ${CUST_PKG_PATH}/lib
+)
+
+add_executable(execute_reduce_sum_op
+    main.cpp
+)
+
+target_link_libraries(execute_reduce_sum_op
+    ascendcl
+    cust_opapi
+    acl_op_compiler
+    nnopbase
+    stdc++
+)
+
+install(TARGETS execute_reduce_sum_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
diff --git a/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/README.md b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4bc6770ff8e0658d91afb7106907e6c2633c8b31
--- /dev/null
+++ b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/README.md
@@ -0,0 +1,52 @@
+## 概述
+本样例相比于AclNNInvocation样例工程,简化了工程配置。
+## 目录结构介绍
+```
+├── AclNNInvocationNaive
+│   ├── CMakeLists.txt      // 编译规则文件
+│   ├── main.cpp            // 单算子调用应用的入口
+│   └── run.sh              // 编译运行算子的脚本
+```
+## 代码实现介绍
+完成自定义算子的开发部署后,可以通过单算子调用的方式来验证单算子的功能。main.cpp代码为单算子API执行方式。单算子API执行基于C语言的API执行算子,无需提供单算子描述文件进行离线模型的转换,可直接调用单算子API接口。
+
+自定义算子编译部署后,会自动生成单算子API,可以直接在应用程序中调用。算子API的形式一般定义为“两段式接口”,形如:
+    ```cpp
+    // 获取算子使用的workspace空间大小
+    aclnnStatus aclnnReduceSumCustomGetWorkspaceSize(const aclTensor *x, const aclIntArray *dim, const aclTensor *out, uint64_t *workspaceSize, aclOpExecutor **executor);
+    // 执行算子
+    aclnnStatus aclnnReduceSumCustom(void *workspace, uint64_t workspaceSize, aclOpExecutor *executor, aclrtStream stream);
+    ```
+其中`aclnnReduceSumCustomGetWorkspaceSize`为第一段接口,用于计算本次API调用所需的`workspace`内存大小。获取到本次API计算需要的`workspace`大小之后,开发者按照`workspaceSize`大小申请Device侧内存,然后调用第二段接口`aclnnReduceSumCustom`执行计算。具体参考[单算子API调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)章节。
+## 运行样例算子
+### 1. 编译算子工程
+运行此样例前,请参考[编译算子工程](../README.md#operatorcompile)完成前期准备。
+### 2. aclnn调用样例运行
+
+ - 进入到样例目录
+   以git命令行方式下载样例代码(以master分支为例):
+    ```bash
+    cd atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive
+    ```
+ - 样例编译文件修改
+
+   将CMakeLists.txt文件内"/usr/local/Ascend/ascend-toolkit/latest"替换为CANN软件包安装后的实际路径,
+   例如:/home/HwHiAiUser/Ascend/ascend-toolkit/latest
+
+ - 环境变量配置
+
+   需要设置NPU_HOST_LIB环境变量,以x86为例
+    ```bash
+    export NPU_HOST_LIB=/home/HwHiAiUser/Ascend/ascend-toolkit/latest/x86_64-linux/lib64
+    ```
+ - 样例执行
+
+   用户参考run.sh脚本进行编译与运行。
+    ```bash
+    bash run.sh
+    ```
+
+## 更新说明
+| 时间 | 更新事项 |
+| ---------- | ------------ |
+| 2025/07/22 | 新增本readme |
\ No newline at end of file
diff --git a/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/main.cpp b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f748e7fab273fa66581989d959e7b8f725d73494
--- /dev/null
+++ b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/main.cpp
@@ -0,0 +1,195 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+
+#include "acl/acl.h"
+#include "aclnn_reduce_sum_custom.h"
+
+namespace {
+#define CHECK_RET(cond, return_expr) \
+    do {                             \
+        if (!(cond)) {               \
+            return_expr;             \
+        }                            \
+    } while (0)
+
+#define LOG_PRINT(message, ...)         \
+    do {                                \
+        printf(message, ##__VA_ARGS__); \
+    } while (0)
+
+int64_t GetShapeSize(const std::vector<int64_t> &shape)
+{
+    int64_t shapeSize = 1;
+    for (auto i : shape) {
+        shapeSize *= i;
+    }
+    return shapeSize;
+}
+
+int Init(int32_t deviceId, aclrtStream *stream)
+{
+    // Fixed code, acl initialization
+    auto ret = aclInit(nullptr);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return 1);
+    ret = aclrtSetDevice(deviceId);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return 1);
+    ret = aclrtCreateStream(stream);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return 1);
+
+    return 0;
+}
+
+template <typename T>
+int CreateAclTensor(const std::vector<T> &hostData, const std::vector<int64_t> &shape, void **deviceAddr,
+                    aclDataType dataType, aclTensor **tensor)
+{
+    auto size = GetShapeSize(shape) * sizeof(T);
+    // Call aclrtMalloc to allocate device memory
+    auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return 1);
+
+    // Call aclrtMemcpy to copy host data to device memory
+    ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return 1);
+
+    // Call aclCreateTensor to create a aclTensor object
+    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, nullptr, 0, aclFormat::ACL_FORMAT_ND, shape.data(),
+                              shape.size(), *deviceAddr);
+    return 0;
+}
+
+void DestroyResources(std::vector<aclTensor *> tensors, std::vector<void *> deviceAddrs, aclrtStream stream,
+                      int32_t deviceId, void *workspaceAddr = nullptr)
+{
+    // Release aclTensor and device memory
+    for (uint32_t i = 0; i < tensors.size(); i++) {
+        if (tensors[i] != nullptr) {
+            aclDestroyTensor(reinterpret_cast<aclTensor *>(tensors[i]));
+        }
+        if (deviceAddrs[i] != nullptr) {
+            aclrtFree(deviceAddrs[i]);
+        }
+    }
+    if (workspaceAddr != nullptr) {
+        aclrtFree(workspaceAddr);
+    }
+    // Destroy stream and reset device
+    aclrtDestroyStream(stream);
+    aclrtResetDevice(deviceId);
+    aclFinalize();
+}
+
+void InitializeData(std::vector<float> &inputX, std::vector<float> &outputY, std::vector<float> &golden,
+                    std::vector<int64_t> &inputXShape, std::vector<int64_t> &outputYShape)
+{
+    for (int i = 0; i < inputXShape[0] * inputXShape[1]; ++i) {
+        inputX[i] = 1.0;
+    }
+    float dealResult = 8.0;
+    for (int i = 0; i < outputYShape[0] * outputYShape[1]; ++i) {
+        golden[i] = dealResult;
+        outputY[i] = 0.0;
+    }
+}
+
+bool VerifyResults(const std::vector<float> &goldenData, const std::vector<float> &resultData)
+{
+    int64_t len = 10;
+    LOG_PRINT("result is:\n");
+    for (int64_t i = 0; i < len; i++) {
+        LOG_PRINT("%.1f ", resultData[i]);
+    }
+    LOG_PRINT("\n");
+    if (std::equal(resultData.begin(), resultData.end(), goldenData.begin())) {
+        LOG_PRINT("test pass\n");
+    } else {
+        LOG_PRINT("test failed\n");
+        return false;
+    }
+    return true;
+}
+}
+
+int main(int argc, char **argv)
+{
+    // 1. (Fixed code) Initialize device / stream, refer to the list of external interfaces of acl
+    // Update deviceId to your own device id
+    int32_t deviceId = 0;
+    aclrtStream stream;
+    auto ret = Init(deviceId, &stream);
+    CHECK_RET(ret == 0, LOG_PRINT("Init acl failed. ERROR: %d\n", ret); return 1);
+
+    // 2. Create input and output, need to customize according to the interface of the API
+    std::vector<int64_t> inputXShape = {8, 2048};
+    std::vector<int64_t> outputYShape = {1, 2048};
+    void *inputXDeviceAddr = nullptr;
+    void *outputYDeviceAddr = nullptr;
+    aclTensor *inputX = nullptr;
+    aclTensor *outputY = nullptr;
+    std::vector<float> inputXHostData(inputXShape[0] * inputXShape[1]);
+    std::vector<float> outputYHostData(outputYShape[0] * outputYShape[1]);
+    std::vector<float> goldenData(outputYShape[0] * outputYShape[1]);
+
+    InitializeData(inputXHostData, outputYHostData, goldenData, inputXShape, outputYShape);
+    std::vector<aclTensor *> tensors = {inputX, outputY};
+    std::vector<void *> deviceAddrs = {inputXDeviceAddr, outputYDeviceAddr};
+    // Create inputX aclTensor
+    ret = CreateAclTensor(inputXHostData, inputXShape, &inputXDeviceAddr, aclDataType::ACL_FLOAT, &inputX);
+    CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return 1);
+    // Create outputY aclTensor
+    ret = CreateAclTensor(outputYHostData, outputYShape, &outputYDeviceAddr, aclDataType::ACL_FLOAT, &outputY);
+    CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return 1);
+    // Create dimOut aclIntArray
+    std::vector<int64_t> dim{0};
+    aclIntArray* dimOut = aclCreateIntArray(dim.data(), dim.size());
+    // 3. Call the API of the custom operator library
+    uint64_t workspaceSize = 0;
+    aclOpExecutor *executor;
+    // Calculate the workspace size and allocate memory for it
+    ret = aclnnReduceSumCustomGetWorkspaceSize(inputX, dimOut, outputY, &workspaceSize, &executor);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnReduceSumCustomGetWorkspaceSize failed. ERROR: %d\n", ret);
+              DestroyResources(tensors, deviceAddrs, stream, deviceId); return 1);
+    void *workspaceAddr = nullptr;
+    if (workspaceSize > 0U) {
+        ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+        CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+    }
+    // Execute the custom operator
+    ret = aclnnReduceSumCustom(workspaceAddr, workspaceSize, executor, stream);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnReduceSumCustom failed. ERROR: %d\n", ret);
+              DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+
+    // 4. (Fixed code) Synchronize and wait for the task to complete
+    ret = aclrtSynchronizeStream(stream);
+    CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+
+    // 5. Get the output value, copy the result from device memory to host memory, need to modify according to the
+    //    interface of the API
+    auto size = GetShapeSize(outputYShape);
+    std::vector<float> resultData(size, 0);
+    ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]), outputYDeviceAddr,
+                      size * sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. ERROR: %d\n", ret);
+              DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+
+    // 6. Destroy resources, need to modify according to the interface of the API
+    DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr);
+
+    // print the output result
+    if (!VerifyResults(goldenData, resultData)) {
+        return -1;
+    }
+    return 0;
+}
diff --git a/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/run.sh b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9bb88021ddeeb0aaca15af2eb5f463cbe5a3a8fe
--- /dev/null
+++ b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/run.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+source $_ASCEND_INSTALL_PATH/bin/setenv.bash
+export DDK_PATH=$_ASCEND_INSTALL_PATH
+export NPU_HOST_LIB=$_ASCEND_INSTALL_PATH/$(arch)-$(uname -s | tr '[:upper:]' '[:lower:]')/devlib
+
+set -e
+rm -rf build
+mkdir -p build
+cmake -B build -DCMAKE_SKIP_RPATH=TRUE
+cmake --build build -j
+(
+    cd build
+    export LD_LIBRARY_PATH=$_ASCEND_INSTALL_PATH/opp/vendors/customize/op_api/lib:$LD_LIBRARY_PATH
+    ./execute_reduce_sum_op
+)
diff --git a/atvc/examples/aclnn/reduce_sum/README.md b/atvc/examples/aclnn/reduce_sum/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..693753811240e87033d47a734414852b0bad8299
--- /dev/null
+++ b/atvc/examples/aclnn/reduce_sum/README.md
@@ -0,0 +1,105 @@
+## 概述
+本样例基于ReduceSumCustom算子工程,介绍了基于ATVC的单算子工程开发与单算子调用。
+
+## 目录结构介绍
+```
+├── reduce_sum                      // 使用框架调用的方式调用ReduceSum算子
+│   ├── AclNNInvocationNaive        // 通过aclnn调用的方式调用ReduceSumCustom算子
+│   ├── ReduceSumCustom             // ReduceSumCustom算子工程
+│   ├── ReduceSumCustom.json        // ReduceSumCustom算子的原型定义json文件
+│   └── install.sh                  // 脚本,调用msOpGen生成自定义算子工程,并编译
+```
+
+## 算子描述
+ReduceSum是对输入tensor的指定轴进行规约累加并输出结果的Reduce类算子。
+
+## 算子规格描述
+| 算子类型(OpType) | ReduceSum | | | |
+| ---- | ---- | ---- | ---- | ---- |
+| 算子输入 | name | shape | data type | format |
+| | x | 8 * 2048 | float,int | ND |
+| 算子输出 | y | 1 * 2048 | float,int | ND |
+| 核函数名 | reduce_sum_custom | | | |
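+
+为便于理解上表中8 * 2048在dim={0}上规约为1 * 2048的计算语义,下面给出一段host侧参考实现示意(仅用于说明ReduceSum的数学语义,并非ATVC的模板实现):
+```cpp
+#include <cstdint>
+#include <vector>
+
+// ReduceSum在dim={0}上的golden参考:对第0轴逐列累加,rows * cols -> 1 * cols
+std::vector<float> ReduceSumDim0(const std::vector<float> &x, int64_t rows, int64_t cols)
+{
+    std::vector<float> y(cols, 0.0f);
+    for (int64_t r = 0; r < rows; ++r) {
+        for (int64_t c = 0; c < cols; ++c) {
+            y[c] += x[r * cols + c];
+        }
+    }
+    return y;
+}
+```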
+
+## 支持的产品型号
+本样例支持如下产品型号:
+- Atlas A2训练系列产品
+
+## 算子工程介绍
+算子工程目录ReduceSumCustom包含算子的实现文件,如下所示:
+```
+├── ReduceSumCustom         // ReduceSumCustom自定义算子工程
+│   ├── op_host             // host侧实现文件
+│   ├── op_kernel           // kernel侧实现文件
+│   ├── build.sh            // 算子构建入口
+│   └── CMakeLists.txt      // 算子的cmake文件
+```
+CANN软件包中提供了工程创建工具msOpGen,ReduceSumCustom算子工程可通过ReduceSumCustom.json自动创建,自定义算子工程具体请参考[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)>工程化算子开发>创建算子工程 章节。
+
+创建完自定义算子工程后,开发者重点需要完成算子host和kernel文件的功能开发。为简化样例运行流程,本样例已在ReduceSumCustom目录中准备好了必要的算子实现,install.sh脚本会创建一个CustomOp目录,并将算子实现文件复制到对应目录下,再编译算子。
+
+备注:CustomOp目录为生成目录,每次执行install.sh脚本都会删除该目录并重新生成,切勿在该目录下编写算子代码,否则会存在丢失风险。
+
+## 编译运行样例算子
+针对自定义算子工程,编译运行包含如下步骤:
+- 调用msOpGen工具生成自定义算子工程;
+- 基于ATVC框架完成算子host和kernel实现;
+- 编译自定义算子工程生成自定义算子包;
+- 安装自定义算子包到自定义算子库中;
+- 调用执行自定义算子。
+
+详细操作如下所示。
+### 1. 获取源码包及环境配置
+编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)完成源码包获取及环境变量准备。
+
+### 2. 生成自定义算子工程,复制host和kernel实现并编译算子
+ - 切换到msOpGen脚本install.sh所在目录
+    ```bash
+    # 若开发者以git命令行方式clone了master分支代码,并切换目录
+    cd ./atvc/examples/aclnn/reduce_sum
+    ```
+
+ - 调用脚本,生成自定义算子工程,复制host和kernel实现并编译算子
+
+   运行install.sh脚本
+    ```bash
+    bash install.sh -v [SOC_VERSION]
+    ```
+    参数说明:
+    - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号:
+    - Atlas A2训练系列产品
+
+   脚本运行成功后,会在当前目录下创建CustomOp目录;编译完成后,会在CustomOp/build_out中生成自定义算子安装包custom_opp_<target os>_<target architecture>.run,例如“custom_opp_ubuntu_x86_64.run”。
+
+### 3. 部署自定义算子包
+- 部署自定义算子包前,请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH
+    ```bash
+    echo $ASCEND_OPP_PATH
+    # 输出示例 /usr/local/Ascend/ascend-toolkit/latest/opp
+
+    # 若没有,则需导出CANN环境变量
+    source [ASCEND_INSTALL_PATH]/bin/setenv.bash
+    # 例如 source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash
+    ```
+    参数说明:
+    - ASCEND_INSTALL_PATH:CANN软件包安装路径,一般和上一步中指定的路径保持一致
+
+- 在自定义算子安装包所在路径下,执行如下命令安装自定义算子包
+    ```bash
+    cd CustomOp/build_out
+    ./custom_opp_<target os>_<target architecture>.run
+    ```
+    命令执行成功后,自定义算子包中的相关文件将部署至opp算子库环境变量ASCEND_OPP_PATH指向的vendors/customize目录中。
+
+### 4.
调用执行算子工程 +- [aclnn调用ReduceSumCustom算子工程](./AclNNInvocationNaive/README.md) + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ---------------------------- | +| 2025/07/24 | 新增readme | diff --git a/atvc/examples/aclnn/reduce_sum/ReduceSumCustom.json b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom.json new file mode 100644 index 0000000000000000000000000000000000000000..c214368089dea1b9c4d98f18576d7b7e7bd0fe31 --- /dev/null +++ b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom.json @@ -0,0 +1,41 @@ +[ + { + "op": "ReduceSumCustom", + "language": "cpp", + "input_desc": [ + { + "name": "x", + "param_type": "required", + "format": [ + "ND", + "ND" + ], + "type": [ + "float32", + "int32" + ] + } + ], + "output_desc": [ + { + "name": "y", + "param_type": "required", + "format": [ + "ND", + "ND" + ], + "type": [ + "float32", + "int32" + ] + } + ], + "attr": [ + { + "name": "dim", + "param_type": "required", + "type": "list_int" + } + ] + } +] \ No newline at end of file diff --git a/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/CMakeLists.txt b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7e1f43e5a2ed42c2ee3395308378e61eb15606f --- /dev/null +++ b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/CMakeLists.txt @@ -0,0 +1,76 @@ +cmake_minimum_required(VERSION 3.16.0) +project(opp) + +include(cmake/config.cmake) +include(cmake/func.cmake) +include(cmake/intf.cmake) + +if(ENABLE_CROSS_COMPILE) + if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL x86_64) + set(CROSS_COMPILE_PLATFORM aarch64) + else() + set(CROSS_COMPILE_PLATFORM x86_64) + endif() + set(PLATFORM ${CMAKE_SYSTEM_PROCESSOR}) + set(CMAKE_COMPILE_COMPILER_LIBRARY ${ASCEND_CANN_PACKAGE_PATH}/${PLATFORM}-linux/devlib/linux/${CROSS_COMPILE_PLATFORM}/) + set(CMAKE_COMPILE_RUNTIME_LIBRARY ${ASCEND_CANN_PACKAGE_PATH}/${PLATFORM}-linux/devlib/${CROSS_COMPILE_PLATFORM}/) + if(CMAKE_CROSS_LIBRARY_PATH) + set(CMAKE_COMPILE_COMPILER_LIBRARY ${CMAKE_CROSS_LIBRARY_PATH}) + set(CMAKE_COMPILE_RUNTIME_LIBRARY ${CMAKE_CROSS_LIBRARY_PATH}) + endif() + set(CMAKE_SYSTEM_PROCESSOR ${CROSS_COMPILE_PLATFORM}) + set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER}) + set(CMAKE_CXX_COMPILER ${CMAKE_CROSS_PLATFORM_COMPILER}) +else() + set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER}) +endif() + +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework) + add_subdirectory(framework) +endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) + add_subdirectory(op_host) +endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) + add_subdirectory(op_kernel) +endif() +if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) +endif() + +# modify vendor_name in install.sh and upgrade.sh +add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh + COMMAND mkdir -p ${CMAKE_BINARY_DIR}/scripts + COMMAND cp -r ${CMAKE_SOURCE_DIR}/scripts/* ${CMAKE_BINARY_DIR}/scripts/ + COMMAND sed -i "s/vendor_name=customize/vendor_name=${vendor_name}/g" ${CMAKE_BINARY_DIR}/scripts/* +) +add_custom_target(modify_vendor ALL DEPENDS ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh) + +get_system_info(SYSTEM_INFO) + +# gen version.info +add_custom_target(gen_version_info ALL + COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/util/gen_version_info.sh ${ASCEND_CANN_PACKAGE_PATH} ${CMAKE_CURRENT_BINARY_DIR} +) + +if(NOT ASCEND_PACK_SHARED_LIBRARY) + install(DIRECTORY ${CMAKE_BINARY_DIR}/scripts/ DESTINATION . 
FILE_PERMISSIONS OWNER_EXECUTE OWNER_READ GROUP_READ) + + install(FILES ${CMAKE_SOURCE_DIR}/custom.proto DESTINATION packages OPTIONAL) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/version.info + DESTINATION packages/vendors/${vendor_name}/) + + # CPack config + set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME}) + set(CPACK_PACKAGE_VERSION ${CMAKE_PROJECT_VERSION}) + set(CPACK_PACKAGE_DESCRIPTION "CPack opp project") + set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CPack opp project") + set(CPACK_PACKAGE_DIRECTORY ${CMAKE_INSTALL_PREFIX}) + set(CPACK_PACKAGE_FILE_NAME "custom_opp_${SYSTEM_INFO}.run") + set(CPACK_GENERATOR External) + set(CPACK_CMAKE_GENERATOR "Unix Makefiles") + set(CPACK_EXTERNAL_ENABLE_STAGING TRUE) + set(CPACK_EXTERNAL_PACKAGE_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/makeself.cmake) + set(CPACK_EXTERNAL_BUILT_PACKAGES ${CPACK_PACKAGE_DIRECTORY}/_CPack_Packages/Linux/External/${CPACK_PACKAGE_FILE_NAME}/${CPACK_PACKAGE_FILE_NAME}) + include(CPack) +endif() diff --git a/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/cmake/func.cmake b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/cmake/func.cmake new file mode 100644 index 0000000000000000000000000000000000000000..e234f442aedc54b40f2d557d5ead27d7e8d880f5 --- /dev/null +++ b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/cmake/func.cmake @@ -0,0 +1,377 @@ +include(ExternalProject) +if(DEFINED ENV{ATVC_PATH}) + set(ATVC_PATH $ENV{ATVC_PATH}) +else() + set(ATVC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../include") +endif() +function(get_system_info SYSTEM_INFO) + if (UNIX) + execute_process(COMMAND grep -i ^id= /etc/os-release OUTPUT_VARIABLE TEMP) + string(REGEX REPLACE "\n|id=|ID=|\"" "" SYSTEM_NAME ${TEMP}) + set(${SYSTEM_INFO} ${SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR} PARENT_SCOPE) + elseif (WIN32) + message(STATUS "System is Windows. 
Only for pre-build.") + else () + message(FATAL_ERROR "${CMAKE_SYSTEM_NAME} not support.") + endif () +endfunction() + +function(opbuild) + message(STATUS "Opbuild generating sources") + cmake_parse_arguments(OPBUILD "" "OUT_DIR;PROJECT_NAME;ACCESS_PREFIX;ENABLE_SOURCE" "OPS_SRC" ${ARGN}) + execute_process(COMMAND ${CMAKE_COMPILE} -g -fPIC -shared -std=c++17 ${OPBUILD_OPS_SRC} -D_GLIBCXX_USE_CXX11_ABI=0 + -I ${ASCEND_CANN_PACKAGE_PATH}/include -I ${CMAKE_CURRENT_SOURCE_DIR}/../op_kernel + -L ${ASCEND_CANN_PACKAGE_PATH}/lib64 -lexe_graph -lregister -ltiling_api -I ${ATVC_PATH} + -o ${OPBUILD_OUT_DIR}/libascend_all_ops.so + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message("build ops lib info: ${EXEC_INFO}") + message("build ops lib error: ${EXEC_ERROR}") + message(FATAL_ERROR "opbuild run failed!") + endif() + set(proj_env "") + set(prefix_env "") + if (NOT "${OPBUILD_PROJECT_NAME}x" STREQUAL "x") + set(proj_env "OPS_PROJECT_NAME=${OPBUILD_PROJECT_NAME}") + endif() + if (NOT "${OPBUILD_ACCESS_PREFIX}x" STREQUAL "x") + set(prefix_env "OPS_DIRECT_ACCESS_PREFIX=${OPBUILD_ACCESS_PREFIX}") + endif() + + set(ENV{ENABLE_SOURCE_PACAKGE} ${OPBUILD_ENABLE_SOURCE}) + if(${ASCEND_PACK_SHARED_LIBRARY}) + if (NOT vendor_name) + message(FATAL_ERROR "ERROR: vendor_name is invalid!") + return() + endif() + set(ENV{ASCEND_VENDOR_NAME} ${vendor_name}) + set(ENV{OPS_PRODUCT_NAME} ${ASCEND_COMPUTE_UNIT}) + set(ENV{SYSTEM_PROCESSOR} ${CMAKE_SYSTEM_PROCESSOR}) + endif() + execute_process(COMMAND ${proj_env} ${prefix_env} ${ASCEND_CANN_PACKAGE_PATH}/toolkit/tools/opbuild/op_build + ${OPBUILD_OUT_DIR}/libascend_all_ops.so ${OPBUILD_OUT_DIR} + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR + ) + unset(ENV{ENABLE_SOURCE_PACAKGE}) + if(${ASCEND_PACK_SHARED_LIBRARY}) + unset(ENV{ASCEND_VENDOR_NAME}) + unset(ENV{OPS_PRODUCT_NAME}) + unset(ENV{SYSTEM_PROCESSOR}) + endif() + if (${EXEC_RESULT}) + message("opbuild ops info: ${EXEC_INFO}") + message("opbuild ops error: ${EXEC_ERROR}") + endif() + message(STATUS "Opbuild generating sources - done") +endfunction() + +function(add_ops_info_target) + cmake_parse_arguments(OPINFO "" "TARGET;OPS_INFO;OUTPUT;INSTALL_DIR" "" ${ARGN}) + get_filename_component(opinfo_file_path "${OPINFO_OUTPUT}" DIRECTORY) + add_custom_command(OUTPUT ${OPINFO_OUTPUT} + COMMAND mkdir -p ${opinfo_file_path} + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/parse_ini_to_json.py + ${OPINFO_OPS_INFO} ${OPINFO_OUTPUT} + ) + add_custom_target(${OPINFO_TARGET} ALL + DEPENDS ${OPINFO_OUTPUT} + ) + if(NOT ${ASCEND_PACK_SHARED_LIBRARY}) + install(FILES ${OPINFO_OUTPUT} + DESTINATION ${OPINFO_INSTALL_DIR} + ) + endif() +endfunction() + +function(add_ops_compile_options OP_TYPE) + cmake_parse_arguments(OP_COMPILE "" "OP_TYPE" "COMPUTE_UNIT;OPTIONS" ${ARGN}) + execute_process(COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_gen_options.py + ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS} ${OP_TYPE} ${OP_COMPILE_COMPUTE_UNIT} + ${OP_COMPILE_OPTIONS} + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR) + if (${EXEC_RESULT}) + message("add ops compile options info: ${EXEC_INFO}") + message("add ops compile options error: ${EXEC_ERROR}") + message(FATAL_ERROR "add ops compile options failed!") + endif() +endfunction() + +function(add_npu_support_target) + cmake_parse_arguments(NPUSUP "" "TARGET;OPS_INFO_DIR;OUT_DIR;INSTALL_DIR" "" 
${ARGN}) + get_filename_component(npu_sup_file_path "${NPUSUP_OUT_DIR}" DIRECTORY) + add_custom_command(OUTPUT ${NPUSUP_OUT_DIR}/npu_supported_ops.json + COMMAND mkdir -p ${NPUSUP_OUT_DIR} + COMMAND ${CMAKE_SOURCE_DIR}/cmake/util/gen_ops_filter.sh + ${NPUSUP_OPS_INFO_DIR} + ${NPUSUP_OUT_DIR} + ) + add_custom_target(npu_supported_ops ALL + DEPENDS ${NPUSUP_OUT_DIR}/npu_supported_ops.json + ) + if(NOT ${ASCEND_PACK_SHARED_LIBRARY}) + install(FILES ${NPUSUP_OUT_DIR}/npu_supported_ops.json + DESTINATION ${NPUSUP_INSTALL_DIR} + ) + endif() +endfunction() + +function(add_simple_kernel_compile) + set(options "") + set(single_value_args "OPS_INFO;OUT_DIR;TILING_LIB;OP_TYPE;SRC;COMPUTE_UNIT;JSON_FILE;DYNAMIC_PATH") + set(multi_value_args "OPTIONS;CONFIGS") + cmake_parse_arguments(BINCMP "${options}" "${single_value_args}" "${multi_value_args}" ${ARGN}) + if (NOT DEFINED BINCMP_OUT_DIR) + set(BINCMP_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/binary) + endif() + if (NOT DEFINED BINCMP_TILING_LIB) + set(BINCMP_TILING_LIB $) + endif() + if (${ASCEND_PACK_SHARED_LIBRARY}) + if (NOT TARGET op_kernel_pack) + add_custom_target(op_kernel_pack + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_pack_kernel.py + --input-path=${BINCMP_OUT_DIR} + --output-path=${BINCMP_OUT_DIR}/library + --enable-library=${ASCEND_PACK_SHARED_LIBRARY} + --platform=${CMAKE_SYSTEM_PROCESSOR}) + add_library(ascend_kernels INTERFACE) + target_link_libraries(ascend_kernels INTERFACE kernels) + target_link_directories(ascend_kernels INTERFACE ${BINCMP_OUT_DIR}/library) + target_include_directories(ascend_kernels INTERFACE ${BINCMP_OUT_DIR}/library) + add_dependencies(ascend_kernels op_kernel_pack) + add_dependencies(op_kernel_pack ${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT}) + endif() + endif() + # add Environment Variable Configurations of ccache + set(_ASCENDC_ENV_VAR) + if(${CMAKE_CXX_COMPILER_LAUNCHER} MATCHES "ccache$") + list(APPEND _ASCENDC_ENV_VAR export ASCENDC_CCACHE_EXECUTABLE=${CMAKE_CXX_COMPILER_LAUNCHER} &&) + endif() + + if (NOT DEFINED BINCMP_OPS_INFO) + set(BINCMP_OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${BINCMP_COMPUTE_UNIT}-ops-info.ini) + endif() + if (NOT ${ENABLE_CROSS_COMPILE}) + add_custom_target(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} + COMMAND ${_ASCENDC_ENV_VAR} ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_compile_kernel.py + --op-name=${BINCMP_OP_TYPE} + --src-file=${BINCMP_SRC} + --compute-unit=${BINCMP_COMPUTE_UNIT} + --compile-options=\"${BINCMP_OPTIONS}\" + --debug-config=\"${BINCMP_CONFIGS}\" + --config-ini=${BINCMP_OPS_INFO} + --tiling-lib=${BINCMP_TILING_LIB} + --output-path=${BINCMP_OUT_DIR} + --dynamic-dir=${BINCMP_DYNAMIC_PATH} + --enable-binary=\"${ENABLE_BINARY_PACKAGE}\" + --json-file=${BINCMP_JSON_FILE} + --build-tool=$(MAKE)) + add_dependencies(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} cust_optiling) + else() + if (${ENABLE_BINARY_PACKAGE} AND NOT DEFINED HOST_NATIVE_TILING_LIB) + message(FATAL_ERROR "Native host libs was not set for cross compile!") + endif() + add_custom_target(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} + COMMAND ${_ASCENDC_ENV_VAR} ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_compile_kernel.py + --op-name=${BINCMP_OP_TYPE} + --src-file=${BINCMP_SRC} + --compute-unit=${BINCMP_COMPUTE_UNIT} + --compile-options=\"${BINCMP_OPTIONS}\" + --debug-config=\"${BINCMP_CONFIGS}\" + --config-ini=${BINCMP_OPS_INFO} + --tiling-lib=${HOST_NATIVE_TILING_LIB} + --output-path=${BINCMP_OUT_DIR} + --dynamic-dir=${BINCMP_DYNAMIC_PATH} + 
--enable-binary=\"${ENABLE_BINARY_PACKAGE}\" + --json-file=${BINCMP_JSON_FILE} + --build-tool=$(MAKE)) + endif() + add_dependencies(ascendc_bin_${BINCMP_COMPUTE_UNIT}_gen_ops_config ${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT}) + add_dependencies(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} ops_info_gen_${BINCMP_COMPUTE_UNIT}) +endfunction() + +function(ascendc_device_library) + message(STATUS "Ascendc device library generating") + cmake_parse_arguments(DEVICE "" "TARGET;OPTION" "SRC" ${ARGN}) + execute_process( + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink + COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink/CMakeLists.txt + ) + execute_process( + COMMAND ${CMAKE_COMMAND} -E echo "cmake_minimum_required(VERSION 3.16.0)\nproject(cust_tiling_sink)\ninclude(${CMAKE_SOURCE_DIR}/cmake/device_task.cmake)\n" + OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink/CMakeLists.txt + RESULT_VARIABLE result + ) + string(REPLACE ";" " " DEVICE_SRC "${DEVICE_SRC}") + ExternalProject_Add(tiling_sink_task + SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink + CONFIGURE_COMMAND ${CMAKE_COMMAND} + -DASCEND_CANN_PACKAGE_PATH=${ASCEND_CANN_PACKAGE_PATH} + -DTARGET=${DEVICE_TARGET} + -DOPTION=${DEVICE_OPTION} + -DSRC=${DEVICE_SRC} + -DVENDOR_NAME=${vendor_name} + + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} + INSTALL_COMMAND "" + BUILD_ALWAYS TRUE + ) + ExternalProject_Get_Property(tiling_sink_task BINARY_DIR) + set(TILINGSINK_LIB_PATH "") + if ("${DEVICE_OPTION}" STREQUAL "SHARED") + set(TILINGSINK_LIB_PATH "${BINARY_DIR}/libcust_opmaster.so") + else() + set(TILINGSINK_LIB_PATH "${BINARY_DIR}/libcust_opmaster.a") + endif() + install(FILES ${TILINGSINK_LIB_PATH} + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_master_device/lib + ) +endfunction() +function(add_opregistry_target) + string(REPLACE ";" "-" COMPUTE_UNIT "${ASCEND_COMPUTE_UNIT}") + add_custom_target(op_registry_pack + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_pack_opregistry.py + --input-path=${CMAKE_SOURCE_DIR}/build_out/ + --base-path=${CMAKE_SOURCE_DIR}/build_out/tmp/vendors/ + --output-path=${CMAKE_SOURCE_DIR}/build_out/library/ + --vendor-name=${vendor_name} + --compute-unit=${COMPUTE_UNIT} + --framework-type=${ASCEND_FRAMEWORK_TYPE} + --platform=${CMAKE_SYSTEM_PROCESSOR}) + add_library(ascend_opregistry INTERFACE) + target_link_libraries(ascend_opregistry INTERFACE opregistry) + target_link_directories(ascend_opregistry INTERFACE ${CMAKE_SOURCE_DIR}/build_out/library) + target_include_directories(ascend_opregistry INTERFACE ${CMAKE_SOURCE_DIR}/build_out/library) + add_dependencies(ascend_opregistry op_registry_pack) + if(EXISTS "${CMAKE_SOURCE_DIR}/framework/caffe_plugin") + add_dependencies(op_registry_pack cust_caffe_parsers) + elseif(EXISTS "${CMAKE_SOURCE_DIR}/framework/tf_plugin") + add_dependencies(op_registry_pack cust_tf_parsers) + elseif(EXISTS "${CMAKE_SOURCE_DIR}/framework/onnx_plugin") + add_dependencies(op_registry_pack cust_onnx_parsers) + endif() +endfunction() + +function(add_kernels_install) + # install kernel file + if (${ENABLE_SOURCE_PACKAGE}) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/binary/dynamic/ + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic/ + ) + endif() + + # install *.o files and *.json files + if (${ENABLE_BINARY_PACKAGE}) + set(INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/) + foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + 
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}/ + DESTINATION ${INSTALL_DIR}/kernel/${compute_unit}/ + ) + endforeach() + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/binary/config/ + DESTINATION ${INSTALL_DIR}/kernel/config/ + ) + endif() +endfunction() + +function(add_kernels_compile) + set(DYNAMIC_PATH "") + if (${ENABLE_SOURCE_PACKAGE}) + set(DYNAMIC_PATH ${CMAKE_CURRENT_BINARY_DIR}/binary/dynamic) + execute_process(COMMAND sh -c "mkdir -p ${DYNAMIC_PATH} && + cp -rf ${CMAKE_SOURCE_DIR}/op_kernel/* ${DYNAMIC_PATH}/ && + rm ${DYNAMIC_PATH}/CMakeLists.txt" + RESULT_VARIABLE EXEC_RESULT + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message(FATAL_ERROR, "copy_source_files failed, gen error:${EXEC_ERROR}" ) + endif() + endif() + + foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + # generate aic-${compute_unit}-ops-info.json + add_ops_info_target(TARGET ops_info_gen_${compute_unit} + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit} + ) + + # define a target:binary to prevent kernel file from being rebuilt during the preinstall process + if (NOT TARGET binary) + add_custom_target(binary) + endif() + + if (${ENABLE_BINARY_PACKAGE} OR ${ENABLE_SOURCE_PACKAGE}) + if (${ENABLE_BINARY_PACKAGE}) + # gen binary_info_config.json and .json + add_custom_target(ascendc_bin_${compute_unit}_gen_ops_config + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/insert_simplified_keys.py + -p ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_ops_config.py + -p ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + -s ${compute_unit} + COMMAND ${CMAKE_COMMAND} -E make_directory + ${CMAKE_CURRENT_BINARY_DIR}/binary/config/${compute_unit} + COMMAND mv ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}/*.json + ${CMAKE_CURRENT_BINARY_DIR}/binary/config/${compute_unit} + ) + else() + if (NOT TARGET ascendc_bin_${compute_unit}_gen_ops_config) + add_custom_target(ascendc_bin_${compute_unit}_gen_ops_config) + endif() + endif() + add_dependencies(binary ascendc_bin_${compute_unit}_gen_ops_config) + + # get op_type-op_name from aic-${compute_unit}-ops-info.ini + execute_process(COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_get_op_name.py + --ini-file=${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + OUTPUT_VARIABLE OP_TYPE_NAME + RESULT_VARIABLE EXEC_RESULT + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message(FATAL_ERROR, "get op name failed, gen error: ${EXEC_ERROR}") + endif() + + # compile op one by one with ascendc_compile_kernel.py + string(REPLACE "\n" ";" TYPE_NAME_LIST "${OP_TYPE_NAME}") + foreach(TYPE_NAME IN LISTS TYPE_NAME_LIST) + if (NOT "${TYPE_NAME}" STREQUAL "") + string(REPLACE "-" ";" bin_sep ${TYPE_NAME}) + list(GET bin_sep 0 op_type) + list(GET bin_sep 1 op_file) + add_simple_kernel_compile(OP_TYPE ${op_type} + SRC ${CMAKE_SOURCE_DIR}/op_kernel/${op_file}.cpp + COMPUTE_UNIT ${compute_unit} + JSON_FILE ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json + DYNAMIC_PATH ${DYNAMIC_PATH}) + endif() + endforeach() + endif() + endforeach() + + # generate npu_supported_ops.json + add_npu_support_target(TARGET npu_supported_ops + OPS_INFO_DIR ${ASCEND_AUTOGEN_PATH} + OUT_DIR 
${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core + INSTALL_DIR packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE} + ) + + if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) + endif() + + if(NOT ASCEND_PACK_SHARED_LIBRARY) + add_kernels_install() + else() + add_opregistry_target() + endif() +endfunction() \ No newline at end of file diff --git a/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_host/CMakeLists.txt b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_host/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a6279ff469e98190072e341d2058941f197ccbf --- /dev/null +++ b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_host/CMakeLists.txt @@ -0,0 +1,183 @@ +if(DEFINED ENV{ATVC_PATH}) + set(ATVC_PATH $ENV{ATVC_PATH}) +else() + set(ATVC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../include") +endif() + +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ops_srcs) +opbuild(OPS_SRC ${ops_srcs} + OUT_DIR ${ASCEND_AUTOGEN_PATH} +) + +file(GLOB group_proto_src ${ASCEND_AUTOGEN_PATH}/group_proto/*.cc) + +add_library(cust_op_proto SHARED + $<$:${group_proto_src}> + ${ops_srcs} + ${ASCEND_AUTOGEN_PATH}/op_proto.cc +) +target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB) +target_compile_options(cust_op_proto PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_op_proto PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_include_directories(cust_op_proto PRIVATE ${ATVC_PATH}) +target_link_libraries(cust_op_proto PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME + cust_opsproto_rt2.0 +) +file(GLOB fallback_src ${ASCEND_AUTOGEN_PATH}/fallback_*.cpp) +add_library(cust_optiling SHARED ${ops_srcs}) +if (${fallback_src}) + target_sources(cust_optiling PRIVATE ${fallback_src}) +endif() +target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB) +target_compile_options(cust_optiling PRIVATE + -fvisibility=hidden +) +target_include_directories(cust_optiling PRIVATE + ${ATVC_PATH} +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_optiling PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_optiling PRIVATE + nnopbase + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME + cust_opmaster_rt2.0 +) + +file(GLOB aclnn_src ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp) +file(GLOB aclnn_inc ${ASCEND_AUTOGEN_PATH}/aclnn_*.h) + +if(NOT ASCEND_PACK_SHARED_LIBRARY) + add_library(cust_opapi SHARED ${aclnn_src}) +else() + file(GLOB op_registry ${ASCEND_AUTOGEN_PATH}/custom_op_registry.cpp) + add_library(cust_opapi SHARED ${aclnn_src} ${op_registry}) + target_compile_definitions(cust_opapi PRIVATE ACLNN_WITH_BINARY) +endif() + +target_include_directories(cust_opapi PRIVATE + ${ATVC_PATH} +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_opapi PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +if(NOT ASCEND_PACK_SHARED_LIBRARY) + target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase) +else() + add_library(cust_op_proto_obj OBJECT + $<$:${group_proto_src}> + ${ops_srcs} + ${ASCEND_AUTOGEN_PATH}/op_proto.cc + ) + 
target_compile_definitions(cust_op_proto_obj PRIVATE OP_PROTO_LIB) + target_compile_options(cust_op_proto_obj PRIVATE + -fvisibility=hidden + ) + if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_op_proto_obj PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) + endif() + target_link_libraries(cust_op_proto_obj PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive + ) + add_library(cust_optiling_obj OBJECT ${ops_srcs}) + target_compile_definitions(cust_optiling_obj PRIVATE OP_TILING_LIB) + target_compile_options(cust_optiling_obj PRIVATE + -fvisibility=hidden + ) + target_include_directories(cust_optiling_obj PRIVATE ${ATVC_PATH}) + if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_optiling_obj PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) + endif() + + target_link_libraries(cust_optiling_obj PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive + ) + target_compile_options(cust_opapi PRIVATE -DLOG_CPP) + target_include_directories(cust_opapi INTERFACE ${ATVC_PATH}) + target_include_directories(cust_opapi INTERFACE ${CMAKE_SOURCE_DIR}/build_out/binary/) + target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase cust_optiling_obj cust_op_proto_obj ascend_opregistry ascend_kernels) + add_dependencies(cust_opapi ascend_opregistry) +endif() + +add_custom_target(optiling_compat ALL + COMMAND ln -sf lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$ + ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so +) + +if(NOT ASCEND_PACK_SHARED_LIBRARY) + install(TARGETS cust_op_proto + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) + install(FILES ${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) + file(GLOB GROUP_PROTO_HEADERS ${ASCEND_AUTOGEN_PATH}/group_proto/*.h) + if (GROUP_PROTO_HEADERS) + install(FILES ${GROUP_PROTO_HEADERS} + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) + endif() + install(TARGETS cust_optiling + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) + install(TARGETS cust_opapi + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_api/lib) + install(FILES ${aclnn_inc} + DESTINATION packages/vendors/${vendor_name}/op_api/include) +else() + file(GLOB group_inc ${ASCEND_AUTOGEN_PATH}/group_proto/*.h) + install(TARGETS cust_opapi + LIBRARY DESTINATION op_api/lib) + install(FILES ${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION op_api/include) + install(FILES ${group_inc} + DESTINATION op_api/include) + install(FILES ${aclnn_inc} + DESTINATION op_api/include) +endif() diff --git a/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_host/reduce_sum_custom.cpp b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_host/reduce_sum_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..df23786935faa9f8f689dff799b3362a395bfa3c --- /dev/null +++ b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_host/reduce_sum_custom.cpp @@ -0,0 +1,85 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include "reduce/reduce_host.h"
+#include "register/op_def_registry.h"
+
+using ReduceOpTraitsFloat = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+using ReduceOpTraitsInt = ATVC::OpTraits<ATVC::OpInputs<int32_t>, ATVC::OpOutputs<int32_t>>;
+
+namespace optiling {
+static ge::graphStatus TilingFunc(gert::TilingContext *context)
+{
+    ATVC::ReducePolicy policy = {0, 0, 0};
+    auto inputShape0 = context->GetInputShape(0)->GetOriginShape();
+    std::vector<int64_t> shapeIn;
+    for (size_t i = 0; i < inputShape0.GetDimNum(); i++) {
+        shapeIn.push_back(inputShape0.GetDim(i));
+    }
+    // 获取dim值
+    const gert::RuntimeAttrs *runtimeAttrs = context->GetAttrs();
+    const gert::TypedContinuousVector<int64_t> *attr0 = runtimeAttrs->GetListInt(0);
+    const int64_t *arr = reinterpret_cast<const int64_t *>(attr0->GetData());
+    std::vector<int64_t> dim(arr, arr + attr0->GetSize());
+    ATVC::ReduceParam *tiling = context->GetTilingData<ATVC::ReduceParam>();
+    if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_FLOAT) {
+        // ReduceOpTraitsFloat为Reduce算子描述原型,根据算子输入shape和dim计算出Tiling数据后填入tiling中
+        (void)ATVC::Host::CalcReduceTiling<ReduceOpTraitsFloat>(shapeIn, dim, &policy, tiling);
+    } else if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_INT32) {
+        (void)ATVC::Host::CalcReduceTiling<ReduceOpTraitsInt>(shapeIn, dim, &policy, tiling);
+    }
+    // 设置tiling的policyId为policy的id
+    tiling->policyId = policy.getID();
+    context->SetBlockDim(tiling->tilingData.coreNum);
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
+    currentWorkspace[0] = 0;
+    return ge::GRAPH_SUCCESS;
+}
+} // namespace optiling
+
+namespace ge {
+static graphStatus InferShape(gert::InferShapeContext *context)
+{
+    const gert::Shape *x1_shape = context->GetInputShape(0);
+    gert::Shape *y_shape = context->GetOutputShape(0);
+    *y_shape = *x1_shape;
+    return GRAPH_SUCCESS;
+}
+
+static graphStatus InferDataType(gert::InferDataTypeContext *context)
+{
+    const auto inputDataType = context->GetInputDataType(0);
+    context->SetOutputDataType(0, inputDataType);
+    return ge::GRAPH_SUCCESS;
+}
+} // namespace ge
+
+namespace ops {
+class ReduceSumCustom : public OpDef {
+public:
+    explicit ReduceSumCustom(const char *name) : OpDef(name)
+    {
+        this->Input("x")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Output("y")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Attr("dim").AttrType(REQUIRED).ListInt();
+
+        this->SetInferShape(ge::InferShape).SetInferDataType(ge::InferDataType);
+        this->AICore().SetTiling(optiling::TilingFunc).AddConfig("ascend910b");
+    }
+};
+OP_ADD(ReduceSumCustom);
+} // namespace ops
diff --git a/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_kernel/CMakeLists.txt b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_kernel/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1a719a60cf480bf01d15b9d1905038536d5e5ccb
--- /dev/null
+++ b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_kernel/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(CMAKE_VERSION_MAKEFILE ON)
+if(DEFINED ENV{ATVC_PATH})
+    set(ATVC_PATH $ENV{ATVC_PATH})
+else()
+    set(ATVC_PATH
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../../include") +endif() +if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx") + add_ops_compile_options(ALL OPTIONS -g -O0) +endif() + +add_ops_compile_options(ALL OPTIONS -g -O0 --cce-aicore-block-local-init -I ${ATVC_PATH}) +add_kernels_compile() \ No newline at end of file diff --git a/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_kernel/reduce_sum_custom.cpp b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_kernel/reduce_sum_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..50a5c73f975c464282ae32cc46da52666ca19529 --- /dev/null +++ b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_kernel/reduce_sum_custom.cpp @@ -0,0 +1,91 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "reduce/reduce_device.h" + +using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; + +extern "C" __global__ __aicore__ void reduce_sum_custom(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, GM_ADDR tiling) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); + REGISTER_TILING_DEFAULT(ATVC::ReduceParam); + GET_TILING_DATA(param, tiling); + if (param.policyId == ATVC::REDUCE_POLICY0.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY0>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY1.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY1>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY2.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY2>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY3.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY3>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY4.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY4>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY5.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY5>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY6.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY6>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY7.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY7>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY8.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY8>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY9.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY9>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY10.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY10>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY11.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY11>(); + op.Run(x, y, tiling); + } else if (param.policyId == 
ATVC::REDUCE_POLICY12.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY12>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY13.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY13>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY14.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY14>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY15.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY15>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY16.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY16>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY17.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY17>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY18.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY18>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY19.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY19>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY20.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY20>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY21.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY21>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY22.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY22>(); + op.Run(x, y, tiling); + } +} diff --git a/atvc/examples/aclnn/reduce_sum/install.sh b/atvc/examples/aclnn/reduce_sum/install.sh new file mode 100644 index 0000000000000000000000000000000000000000..afe455ade5b53bd16ac6b4206650232bf32cf8e2 --- /dev/null +++ b/atvc/examples/aclnn/reduce_sum/install.sh @@ -0,0 +1,57 @@ +#!/bin/bash +SHORT=v:,i:, +LONG=soc-version:,install-path:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" + +while :; do + case "$1" in + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +VERSION_LIST="Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi +source $_ASCEND_INSTALL_PATH/bin/setenv.bash +export ASCEND_HOME_PATH=$_ASCEND_INSTALL_PATH + +OP_NAME=ReduceSumCustom +rm -rf CustomOp +# Generate the op framework +msopgen gen -i $OP_NAME.json -c ai_core-${SOC_VERSION} -lan cpp -out CustomOp +# Copy op implementation files to CustomOp +cp -rf $OP_NAME/* CustomOp +# Delete tiling.h +rm -rf CustomOp/op_host/*._tiling.h +# Build CustomOp project +(cd CustomOp && bash build.sh) \ No newline at end of file diff --git a/atvc/examples/add/add.cpp b/atvc/examples/add/add.cpp index 04ea4a22766471183cb00a790918d83d7c443c16..f5a410a114299bbff4b620dd15bce57201812688 100644 --- a/atvc/examples/add/add.cpp 
+++ b/atvc/examples/add/add.cpp @@ -15,7 +15,8 @@ #include #include #include "acl/acl.h" -#include "atvc.h" +#include "elewise/elewise_host.h" +#include "elewise/elewise_device.h" #define CHECK_ACL(x) \ do { \ @@ -36,7 +37,7 @@ bool IsClose(float a, float b) float diff = std::abs(a - b); return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); } -} + // Add算子中有两个输入,一个输出。类型均为float using ADD_OPTRAITS = ATVC::OpTraits, ATVC::OpOutputs>; @@ -57,6 +58,59 @@ struct AddComputeFunc { } }; +void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &inputY, std::vector &golden) +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(1.0f, 100.0f); + + for (int i = 0; i < eleNum; ++i) { + inputX[i] = dis(gen); + inputY[i] = dis(gen); + golden[i] = inputX[i] + inputY[i]; + } +} + +bool VerifyResults(const std::vector &golden, const std::vector &output) +{ + for (int32_t i = 0; i < golden.size(); i++) { + if (!IsClose(golden[i], output[i])) { + printf("Accuracy verification failed! The expected value of element " + "in index [%d] is %f, but actual value is %f.\n", + i, + golden[i], + output[i]); + return false; + } + } + return true; +} + +void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void CleanACL(aclrtStream &stream, int32_t deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void CleanUp(uint8_t *&zHost, uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&zDevice, uint8_t *¶mDevice) +{ + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(zDevice)); + CHECK_ACL(aclrtFree(paramDevice)); + CHECK_ACL(aclrtFreeHost(zHost)); +} +} + template /* * 该函数为Add算子核函数入口 @@ -79,38 +133,23 @@ int main() int32_t eleNum = 8 * 1024; size_t inputByteSize = static_cast(eleNum) * sizeof(float); size_t outputByteSize = static_cast(eleNum) * sizeof(float); - - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dis(1.0f, 100.0f); std::vector inputX(eleNum); std::vector inputY(eleNum); std::vector golden(eleNum); + InitializeData(eleNum, inputX, inputY, golden); - // 生成输入数据 - for (int i = 0; i < eleNum; ++i) { - inputX[i] = (dis(gen)); - inputY[i] = (dis(gen)); - } - for (int i = 0; i < eleNum; ++i) { - golden[i] = (inputX[i]) + (inputY[i]); - } - printf("Generate golden data successfully.\n"); - // 声明运行态参数param - ATVC::EleWiseParam param; + aclrtContext context; + aclrtStream stream = nullptr; + int32_t deviceId = 0; + InitializeACL(context, stream, deviceId); + ATVC::EleWiseParam param; if (!ATVC::Host::CalcEleWiseTiling(eleNum, param)) { printf("Elewise tiling error.\n"); return -1; }; - CHECK_ACL(aclInit(nullptr)); - aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); + auto elementParamSize = sizeof(param); uint8_t *zHost; uint8_t *xDevice; @@ -118,45 +157,30 @@ int main() uint8_t *zDevice; uint8_t *paramDevice; - CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputByteSize)); + CHECK_ACL(aclrtMallocHost((void **)&zHost, outputByteSize)); CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); 
CHECK_ACL(aclrtMalloc((void **)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMalloc((void **)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)¶mDevice, elementParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, inputY.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); - - auto elementParamSize = sizeof(param); - CHECK_ACL(aclrtMalloc((void**)¶mDevice, elementParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, - reinterpret_cast(¶m), elementParamSize, - ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, reinterpret_cast(¶m), + elementParamSize, ACL_MEMCPY_HOST_TO_DEVICE)); uint32_t blockNum = param.tilingData.blockNum; // 调用核函数 AddCustom<<>>(xDevice, yDevice, zDevice, paramDevice); CHECK_ACL(aclrtSynchronizeStream(stream)); CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); - - std::vector outputZ(reinterpret_cast(zHost), reinterpret_cast(zHost) + eleNum); - CHECK_ACL(aclrtFree(xDevice)); - CHECK_ACL(aclrtFree(yDevice)); - CHECK_ACL(aclrtFree(zDevice)); - CHECK_ACL(aclrtFree(paramDevice)); - CHECK_ACL(aclrtFreeHost(zHost)); + std::vector outputZ(reinterpret_cast(zHost), reinterpret_cast(zHost) + eleNum); - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); + CleanUp(zHost, xDevice, yDevice, zDevice, paramDevice); + CleanACL(stream, deviceId); - for (int32_t i = 0; i < eleNum; i++) { - if (!IsClose(golden[i], outputZ[i])) { - printf("Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, golden[i], outputZ[i]); - return -1; - } + if (!VerifyResults(golden, outputZ)) { + return -1; } printf("Accuracy verification passed.\n"); return 0; diff --git a/atvc/examples/add_with_scalar/add_with_scalar.cpp b/atvc/examples/add_with_scalar/add_with_scalar.cpp index c7aa38993b6f87bfae09e4f5928b62034857ec5c..a2f60e3943e6918a333ad3d6cc6b8bfac91999d9 100644 --- a/atvc/examples/add_with_scalar/add_with_scalar.cpp +++ b/atvc/examples/add_with_scalar/add_with_scalar.cpp @@ -15,7 +15,8 @@ #include #include #include "acl/acl.h" -#include "atvc.h" +#include "elewise/elewise_host.h" +#include "elewise/elewise_device.h" #define CHECK_ACL(x) \ do { \ @@ -36,7 +37,6 @@ bool IsClose(float a, float b) float diff = std::abs(a - b); return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); } -} using OP_TRAITS = ATVC::OpTraits, ATVC::OpOutputs, ATVC::OpTemps>; @@ -55,6 +55,64 @@ struct AddComputeFunc { } }; +void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &inputY, std::vector &golden, + bool conditionVal) +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(1.0f, 100.0f); + + for (int i = 0; i < eleNum; ++i) { + inputX[i] = dis(gen); + inputY[i] = dis(gen); + if (conditionVal) { + golden[i] = 2 * (inputX[i]) + (inputY[i]); // z = 2 * x + y + } else { + golden[i] = 2 * (inputX[i]) - (inputY[i]); // z = 2 * x - y + } + } +} + +bool VerifyResults(const std::vector &golden, const std::vector &output) +{ + for (int32_t i = 0; i < golden.size(); i++) { + if (!IsClose(golden[i], output[i])) { + printf("Accuracy verification failed! 
The expected value of element " + "in index [%d] is %f, but actual value is %f.\n", + i, + golden[i], + output[i]); + return false; + } + } + return true; +} + +void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void CleanACL(aclrtStream &stream, int32_t deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void CleanUp(uint8_t *&zHost, uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&zDevice, uint8_t *¶mDevice) +{ + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(zDevice)); + CHECK_ACL(aclrtFree(paramDevice)); + CHECK_ACL(aclrtFreeHost(zHost)); +} +} + /* * 该函数为AddCustom算子核函数入口 * a Device上的gm地址,指向算子第一个输入 @@ -71,46 +129,31 @@ __global__ __aicore__ void AddCustom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR pa op.Run(a, b, c, param, conditionVal); // 调用Run函数, 执行算子 } + int main() { int32_t eleNum = 8 * 1024; size_t inputByteSize = static_cast(eleNum) * sizeof(float); size_t outputByteSize = static_cast(eleNum) * sizeof(float); - - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dis(1.0f, 100.0f); std::vector inputX(eleNum); std::vector inputY(eleNum); std::vector golden(eleNum); bool conditionVal = false; // 生成输入数据 - for (int i = 0; i < eleNum; ++i) { - inputX[i] = (dis(gen)); - inputY[i] = (dis(gen)); - } - for (int i = 0; i < eleNum; ++i) { - if (conditionVal) { - golden[i] = 2 * (inputX[i]) + (inputY[i]); // z = 2 * x + y - } else { - golden[i] = 2 * (inputX[i]) - (inputY[i]); // z = 2 * x - y - } - } - printf("Generate golden data successfully.\n"); - ATVC::EleWiseParam param; + InitializeData(eleNum, inputX, inputY, golden, conditionVal); - if (!ATVC::Host::CalcEleWiseTiling(eleNum, param)) { - printf("Elewise tiling error.\n"); - return -1; - }; - CHECK_ACL(aclInit(nullptr)); aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); + int32_t deviceId = 0; + InitializeACL(context, stream, deviceId); + + ATVC::EleWiseParam param; + if (!ATVC::Host::CalcEleWiseTiling(eleNum, param)) { + printf("Elewise tiling error.\n"); + return -1; + }; + auto elementParamSize = sizeof(param); uint8_t *zHost; uint8_t *xDevice; @@ -118,45 +161,30 @@ int main() uint8_t *zDevice; uint8_t *paramDevice; - CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputByteSize)); + CHECK_ACL(aclrtMallocHost((void **)&zHost, outputByteSize)); CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMalloc((void **)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMalloc((void **)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)¶mDevice, elementParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, inputY.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, reinterpret_cast(¶m), + elementParamSize, ACL_MEMCPY_HOST_TO_DEVICE)); - auto elementParamSize = sizeof(param); - CHECK_ACL(aclrtMalloc((void**)¶mDevice, elementParamSize, 
ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, - reinterpret_cast(¶m), elementParamSize, - ACL_MEMCPY_HOST_TO_DEVICE)); - - AddCustom<<>>( - xDevice, yDevice, zDevice, paramDevice, conditionVal); + AddCustom + <<>>(xDevice, yDevice, zDevice, paramDevice, conditionVal); CHECK_ACL(aclrtSynchronizeStream(stream)); CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); - - std::vector outputZ(reinterpret_cast(zHost), reinterpret_cast(zHost) + eleNum); - CHECK_ACL(aclrtFree(xDevice)); - CHECK_ACL(aclrtFree(yDevice)); - CHECK_ACL(aclrtFree(zDevice)); - CHECK_ACL(aclrtFree(paramDevice)); - CHECK_ACL(aclrtFreeHost(zHost)); + std::vector outputZ(reinterpret_cast(zHost), reinterpret_cast(zHost) + eleNum); - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); + CleanUp(zHost, xDevice, yDevice, zDevice, paramDevice); + CleanACL(stream, deviceId); - for (int32_t i = 0; i < eleNum; i++) { - if (!IsClose(golden[i], outputZ[i])) { - printf("Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, golden[i], outputZ[i]); + if (!VerifyResults(golden, outputZ)) { return -1; - } } printf("Accuracy verification passed.\n"); return 0; diff --git a/atvc/examples/broadcast_to/broadcast_to.cpp b/atvc/examples/broadcast_to/broadcast_to.cpp index 9b4aa6df0a3aee5631b4370183258a5083e64fab..c3a0794413ded63fe04e34d679afeae1d02c9619 100644 --- a/atvc/examples/broadcast_to/broadcast_to.cpp +++ b/atvc/examples/broadcast_to/broadcast_to.cpp @@ -16,7 +16,8 @@ #include #include #include "acl/acl.h" -#include "atvc.h" +#include "broadcast/broadcast_host.h" +#include "broadcast/broadcast_device.h" #define CHECK_ACL(x) \ do { \ @@ -36,10 +37,10 @@ bool IsClose(float a, float b) { float diff = std::abs(a - b); return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); } -} // BroadcastTo算子的描述:一个输入,一个输出,类型均为float using BroadcastOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; +} /* * 该函数为BroadcastCustom算子核函数入口 @@ -56,7 +57,6 @@ __global__ __aicore__ void BroadcastCustom(GM_ADDR x, GM_ADDR y, GM_ADDR broadca op.Run(x, y, broadcastParam); } - // 负责Broadcast类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 template void BroadcastOpAdapter(uint8_t* x, uint8_t* y, ATVC::BroadcastParam ¶m, ATVC::BroadcastPolicy &policy, aclrtStream& stream) @@ -113,7 +113,7 @@ int32_t main(int32_t argc, char* argv[]) CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); ATVC::BroadcastParam param; // Broadcast运行态参数,包含TilingData以及临时空间的相关信息 - ATVC::BroadcastPolicy policy; // Broadcast运行态参数,负责映射最适合的Broadcast模板实现 + ATVC::BroadcastPolicy policy = {-1, -1, -1}; // Broadcast运行态参数,负责映射最适合的Broadcast模板实现 // Host侧调用Tiling API完成相关运行态参数的运算 if (!ATVC::Host::CalcBroadcastTiling(shapeIn, shapeOut, &policy, ¶m)) { printf("Broadcast tiling error.\n"); diff --git a/atvc/examples/pytorch/README.md b/atvc/examples/pytorch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..945054b730a2b53ca50fc14102e4bf89d65c174b --- /dev/null +++ b/atvc/examples/pytorch/README.md @@ -0,0 +1,192 @@ + ## 概述 +使用ATVC对接pytorch工程简单的示例,适合初学者。 + +## pytorch算子样例说明 +样例通过Ascend C编程语言实现了ATVC框架对接pytorch算子,并按照算子调用方式分别给出了对应的端到端实现。 + +## 算子开发样例 +| 目录名称 | 功能描述 | +| ------------------------------------------------------------ | ---------------------------------------------------- | +| [add](./add) | 基于ATVC框架的Add自定义Vector算子 | 
+| [reduce_sum](./reduce_sum) | 基于ATVC框架的reduce_sum自定义Vector算子 | + +## 快速上手 + + 快速执行example用例,更详细的流程请参阅[add算子](./add/README.md)。 + +- 下载ATVC代码及环境配置,参考[快速入门](../../../docs/1_quick_start.md)。 + +- 导入ATVC环境变量 + ```bash + # 如果不导入,默认使用./atvc/include路径 + export ATVC_PATH=${atvc}/include + ``` + + - 执行add用例 + ```bash + # 基于ATVC编译pytorch Add算子 + $ cd ./atvc/examples/pytorch/add + $ bash run.sh + ... + OK + ``` + +## 基于pytorch算子对接ATVC框架 + +### 步骤1. 定义算子描述,参考[add_custom_impl.h](./add/add_custom_impl.h) + + 首先通过ATVC提供的`ATVC::OpTraits`模板结构体来描述Add算子的输入输出信息,定义如下: +```cpp + // Add算子中有两个输入,一个输出。类型均为float + using AddOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; +``` + +### 步骤2. 实现算子计算逻辑,参考[add_custom_impl.h](./add/add_custom_impl.h) + + 用户需要通过AscendC API来搭建Add算子的核心计算逻辑,在ATVC框架中,这类算子的核心计算逻辑是通过定义一个结构体的仿函数来实现。它需要`ATVC::OpTraits`作为固定模板参数,并重载`operator()`来被提供的Kernel层算子模板类调用。 + +```cpp + // 头文件引入 + #include "elewise/elewise_host.h" + #include "elewise/elewise_device.h" + + // 传入编译态参数ATVC::OpTraits + template + struct AddComputeFunc { + /* + 函数说明: z = x + y + 参数说明: + x : 参与运算的输入 + y : 参与运算的输入 + z : 参与运算的输出 + */ + template + // 重载operator,提供给算子模板类调用 + __aicore__ inline void operator()(AscendC::LocalTensor x, AscendC::LocalTensor y, AscendC::LocalTensor z) { + AscendC::Add(z, x, y, z.GetSize()); // 开发调用AscendC Api自行实现计算逻辑, 通过c.GetSize()获取单次计算的元素数量 + } + }; +``` +### 步骤3. 实现核函数,参考[add_custom_impl.h](./add/add_custom_impl.h) + + ATVC提供的`ATVC::Kernel::EleWiseOpTemplate`算子模板类实现了核内的数据搬运、资源申请和计算调度功能。它将计算仿函数作为模板参数传入来完成构造实例化,用户可通过调用`ATVC::Kernel::EleWiseOpTemplate`算子模板类的`Run(Args&&... args)`接口完成算子的功能计算,完成完整核函数的实现。 在examples/add用例中,算子核函数的形式参数除了输入输出之外,还需额外传入`ATVC::EleWiseParam param`的形参,需要用户在初始化`ATVC::Kernel::EleWiseOpTemplate`时指定`ATVC::EleWiseParam*`的类型。该参数包含算子模板类进行数据搬运数据的必要参数,由`ATVC::Host::CalcEleWiseTiling` API计算得出。 + +```cpp + /* + * 该函数为Add算子核函数入口 + * a Device上的gm地址,指向Add算子第一个输入 + * b Device上的gm地址,指向Add算子第二个输入 + * c Device上的gm地址,指向Add算子第一个输出 + * param ATVC::EleWiseParam数据 + */ + template + __global__ __aicore__ void AddCustom(GM_ADDR a, GM_ADDR b, GM_ADDR c, ATVC::EleWiseParam param) + { + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); + auto op = ATVC::Kernel::EleWiseOpTemplate, ATVC::EleWiseParam*>(); // 将AddComputeFunc仿函数作为模板参数传入,实例化EleWiseOpTemplate模板类,并指明param的数据类型 + op.Run(x, y, z, ¶m); // 按照输入、输出、param的顺序传入Run函数,实现GM->GM的数据计算 + } +``` + 备注:reduce类算子在初始化的时候,` ATVC::Kernel::ReduceOpTemplate, Policy, void, void, ATVC::ReduceParam*>`时指定`ATVC::ReduceParam*`的类型;broadcast类算子在初始化的时候,` ATVC::Kernel::ATVC::Kernel::BroadcastOpTemplate, Policy, ATVC::BroadcastParam*>`时指定`ATVC::BroadcastParam*`的类型。 + + ### 步骤4. 
编写torch入口[pytorch_ascendc_extension.cpp](./add/pytorch_ascendc_extension.cpp) + + 不同的算子类型可参考[快速入门](../../docs/1_quick_start.md)中的模版选择模版进行选择,用户在此处通过`<<<>>>`的方式调用核函数,更多ATVC的用法可参考atvc的[开发指南](../../docs/2_developer_guide.md)。 + +``` cpp + // 头文件引入 + #include + #include "torch_npu/csrc/core/npu/NPUStream.h" + #include "add_custom_impl.h" + + namespace ascendc_elewise_ops { + at::Tensor op_add_custom(const at::Tensor &x, const at::Tensor &y) + { + auto stream = c10_npu::getCurrentNPUStream().stream(false); + at::Tensor z = at::empty_like(x); + // 创建ATVC框架[elewise]所需要的数据,算子数据的长度 + int32_t totalLength = 1; + for (int32_t size : x.sizes()) { + totalLength *= size; + } + // 声明运行态参数param + ATVC::EleWiseParam param; + // Host侧调用Tiling API完成相关运行态参数的运算 + (void)ATVC::Host::CalcEleWiseTiling(totalLength, param); + // 调用核函数 + AddCustom<<>>( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), (uint8_t *)(z.storage().data()), param); + + return z; + } + // 加载算子模版 + TORCH_LIBRARY(ascendc_ops, m) + { + // torch的自定义算子接口 + m.def("add", &ascendc_elewise_ops::op_add_custom); + } + } +``` +### 步骤5. 编写测试用例,参考[run_op.py](./add/run_op.py) +```python + # 导入torch torch所需依赖 + import torch + import torch_npu + # 导入测试用例依赖 + from torch_npu.testing.testcase import TestCase, run_tests + torch.npu.config.allow_internal_format = False + # 加载bishengcc编译出的二进制文件 + torch.ops.load_library('./libascendc_pytorch.so') + + # 测试用例编写 + class TestAscendCOps(TestCase): + # 测试用例 + def test_add_custom_ops_float(self): + length = [8, 2048] + # 生成随机数 + x = torch.rand(length, device='cpu', dtype=torch.float32) + y = torch.rand(length, device='cpu', dtype=torch.float32) + # 调用torch的自定义算子接口 + npuout = torch.ops.ascendc_ops.add(x.npu(), y.npu()) + cpuout = torch.add(x, y) + self.assertRtolEqual(npuout, cpuout) + + if __name__ == '__main__': + run_tests() +``` +### 步骤6. 基于atvc框架pytorch的编译和测试脚本[run.sh](./add/run.sh) +```bash + # 获取torch、torch_npu、python的lib和include路径和atvc的路径 + torch_location=... + torch_npu_location=... + python_include=... + python_lib=... + atvc_path=... + + # 使用bishengcc进行编译pytorch算子 + bishengcc pytorch_ascendc_extension.cpp \ + -arch Ascend910B1 \ + -I${torch_location}/include \ + -I${torch_location}/include/torch/csrc/api/include \ + -I${python_include} \ + -I${atvc_path} \ + -I${torch_npu_location}/include \ + -L${torch_location}/lib \ + -L${torch_npu_location}/lib \ + -L${python_lib} \ + -ltorch -ltorch_cpu -lc10 -ltorch_npu -lpython3 -ltorch_python \ + -o libascendc_pytorch.so \ + -shared + + # 执行测试用例 + python3 run_op.py +``` + +### 步骤7. 算子编译&执行 +```bash + # 基于ATVC编译pytorch Add算子 + $ cd ./atvc/examples/pytorch/add + $ bash run.sh + ... + OK +``` diff --git a/atvc/examples/pytorch/add/README.md b/atvc/examples/pytorch/add/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5df331b32080080cb36959de9ae11e4f1adf8b9d --- /dev/null +++ b/atvc/examples/pytorch/add/README.md @@ -0,0 +1,153 @@ + ## 概述 +本样例基于AddCustom算子工程,介绍了基于ATVC的单算子工程、单算子调用。 + +## 目录结构介绍 +``` +├── add +│ ├── add_custom_impl.h // 通过pytroch调用的方式调用Add算子 +│ ├── pytorch_ascendc_extension.cpp // pytorch调用入口 +│ ├── run_op.py // pytorch的测试用例 +│ └── run.sh // 脚本,编译需要的二进制文件,并测试 +``` + +## 算子描述 +Add算子实现了两个数据相加,返回相加结果的功能。对应的数学表达式为: +``` +z = x + y +``` +## 算子规格描述 + + + + + + + + + + + +
+| 算子类型(OpType) | Add | | | |
+| ---------------- | --------- | -------- | --------- | ------ |
+| 算子输入 | name | shape | data type | format |
+| | x | 8 * 2048 | float,int | ND |
+| | y | 8 * 2048 | float,int | ND |
+| 算子输出 | z | 8 * 2048 | float,int | ND |
+| 核函数名 | AddCustom | | | |
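+
+上表中float、int两种数据类型各对应一份算子描述。下面是按规格表推测的OpTraits声明示意(仅为草图,模板实参以add_custom_impl.h中的实际定义为准):
+```cpp
+// 按规格表推测的算子描述声明(示意;以add_custom_impl.h为准)
+using AddOpTraitsFloat = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>>;
+using AddOpTraitsInt = ATVC::OpTraits<ATVC::OpInputs<int32_t, int32_t>, ATVC::OpOutputs<int32_t>>;
+```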
+ +## 支持的产品型号 +本样例支持如下产品型号: +- Atlas A2训练系列产品 + +## 算子工程介绍 +其中,算子工程目录AddCustom包含算子的实现文件,如下所示: +``` +├── add +│ ├── add_custom_impl.h // 通过pytroch调用的方式调用Add算子 +│ ├── pytorch_ascendc_extension.cpp // pytorch调用入口 +│ ├── run_op.py // pytorch的测试用例 +│ └── run.sh // 脚本,编译需要的二进制文件,并测试 +``` + +run.sh脚本编译pytorch所需要的算子依赖,并将执行测试用例。 + +## 编译运行样例算子 +针对自定义算子工程,编译运行包含如下步骤: +- 完成算子pytorch入口和impl文件的实现; +- 编译pytorch算子的二进制文件; +- 调用执行pytorch算子; + +详细操作如下所示。 +### 1. 获取源码包及环境配置 +编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 +### 1. 安装pytorch环境 +参考[torch的安装](https://gitee.com/ascend/pytorch)进行安装torch、torch_npu环境 + +### 2. 基于ATVC编写pytorch算子的实现 + - 编写kernel侧函数,参考[add_custom_impl.h](./add_custom_impl.h) + ```cpp + // 引入头文件 + #include "elewise/elewise_host.h" + #include "elewise/elewise_device.h" + + // 首先通过ATVC提供的ATVC::OpTraits模板结构体来描述Add算子的输入输出信息,定义如下 + using AddOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; + using AddOpTraitsInt = ATVC::OpTraits, ATVC::OpOutputs>; + + // 实现算子计算逻辑 + // 传入编译态参数ATVC::OpTraits + template + struct AddComputeFunc { + /* + 函数说明: z = x + y + 参数说明: + x : 参与运算的输入 + y : 参与运算的输入 + z : 参与运算的输出 + */ + template + // 重载operator,提供给算子模板类调用 + __aicore__ inline void operator()(AscendC::LocalTensor x, AscendC::LocalTensor y, AscendC::LocalTensor z) { + AscendC::Add(z, x, y, z.GetSize()); // 开发调用AscendC Api自行实现计算逻辑, 通过z.GetSize()获取单次计算的元素数量 + } + }; + + //实现核函数 + template + __global__ __aicore__ void AddCustom(GM_ADDR x, GM_ADDR y, GM_ADDR z, ATVC::EleWiseParam param) + { + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); + // 将AddComputeFunc仿函数作为模板参数传入,实例化EleWiseOpTemplate模板类 + auto op = ATVC::Kernel::EleWiseOpTemplate, ATVC::EleWiseParam*>(); + op.Run(x, y, z, ¶m); + } + ``` + - 编写pytorch入口函数,并调用核函数,参考[pytorch_ascendc_extension.cpp](./pytorch_ascendc_extension.cpp) + ```cpp + at::Tensor op_add_custom(const at::Tensor &x, const at::Tensor &y) + { + auto stream = c10_npu::getCurrentNPUStream().stream(false); + ATVC::EleWiseParam param; + int32_t totalLength = 1; + for (int32_t size : x.sizes()) { + totalLength *= size; + } + (void)ATVC::Host::CalcEleWiseTiling(totalLength, param); + AddCustom<<>>( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), (uint8_t *)(z.storage().data()), param); + } + ``` + - 编写python调用函数,并调用pytorch入口函数,参考[run_op.py](./run_op.py) + + ```py + # 引入头文件 + import torch + import torch_npu + import numpy as np + from torch_npu.testing.testcase import TestCase, run_tests + # 加载二进制 + torch.npu.config.allow_internal_format = False + torch.ops.load_library('./libascendc_pytorch.so') + + class TestAscendCOps(TestCase): + # 测试用例 + def test_add_custom_ops_float(self): + length = [8, 2048] + x = torch.rand(length, device='cpu', dtype=torch.float32) + y = torch.rand(length, device='cpu', dtype=torch.float32) + npuout = torch.ops.ascendc_ops.add(x.npu(), y.npu()) + cpuout = torch.add(x, y) + self.assertRtolEqual(npuout, cpuout) + + # 测试用例调用 + if __name__ == '__main__': + run_tests() + ``` + +### 3. 基于ATVC编写pytorch算子的调用验证 + - 调用脚本,生成pytorch算子,并运行测试用例 + ```bash + $ cd ./atvc/examples/pytorch/add + $ bash run.sh + ... 
+ OK + ``` + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ---------------------------- | +| 2025/07/24 | 新增readme | diff --git a/atvc/examples/pytorch/add/add_custom_impl.h b/atvc/examples/pytorch/add/add_custom_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..86a6912caaaa3264324a68740fa98a70fcb48566 --- /dev/null +++ b/atvc/examples/pytorch/add/add_custom_impl.h @@ -0,0 +1,52 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ADD_CUSTOM_IMPL_H +#define ADD_CUSTOM_IMPL_H +#include "elewise/elewise_host.h" +#include "elewise/elewise_device.h" + +using AddOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; +using AddOpTraitsInt = ATVC::OpTraits, ATVC::OpOutputs>; + +// 传入编译态参数ATVC::OpTraits +template +struct AddComputeFunc { + /* + 函数说明: z = x + y + 参数说明: + x : 参与运算的输入 + y : 参与运算的输入 + z : 参与运算的输出 + */ + template + // 重载operator,提供给算子模板类调用 + __aicore__ inline void operator()(AscendC::LocalTensor x, AscendC::LocalTensor y, AscendC::LocalTensor z) { + AscendC::Add(z, x, y, z.GetSize()); // 开发调用AscendC Api自行实现计算逻辑, 通过c.GetSize()获取单次计算的元素数量 + } +}; + +/* + * 该函数为Add算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第二个输入 + * z Device上的gm地址,指向Add算子第一个输出 + * param ATVC::EleWiseParam数据 + */ +template +__global__ __aicore__ void AddCustom(GM_ADDR x, GM_ADDR y, GM_ADDR z, ATVC::EleWiseParam param) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); + // 将AddComputeFunc仿函数作为模板参数传入,实例化EleWiseOpTemplate模板类 + auto op = ATVC::Kernel::EleWiseOpTemplate, ATVC::EleWiseParam*>(); + op.Run(x, y, z, ¶m); +} +#endif diff --git a/atvc/examples/pytorch/add/pytorch_ascendc_extension.cpp b/atvc/examples/pytorch/add/pytorch_ascendc_extension.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bc6444720e21762fbe052cbdff90c7225c77592b --- /dev/null +++ b/atvc/examples/pytorch/add/pytorch_ascendc_extension.cpp @@ -0,0 +1,46 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include +#include "torch_npu/csrc/core/npu/NPUStream.h" +#include "add_custom_impl.h" + +namespace ascendc_elewise_ops { +at::Tensor op_add_custom(const at::Tensor &x, const at::Tensor &y) +{ + auto stream = c10_npu::getCurrentNPUStream().stream(false); + at::Tensor z = at::empty_like(x); + int32_t totalLength = 1; + for (int32_t size : x.sizes()) { + totalLength *= size; + } + // 声明运行态参数param + ATVC::EleWiseParam param; + if (x.scalar_type() == at::kFloat) { + // Host侧调用Tiling API完成相关运行态参数的运算 + (void)ATVC::Host::CalcEleWiseTiling(totalLength, param); + // 调用核函数 + AddCustom<<>>( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), (uint8_t *)(z.storage().data()), param); + } else if (x.scalar_type() == at::kInt) { + (void)ATVC::Host::CalcEleWiseTiling(totalLength, param); + // 调用核函数 + AddCustom<<>>( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), (uint8_t *)(z.storage().data()), param); + } + return z; +} + +TORCH_LIBRARY(ascendc_ops, m) +{ + m.def("add", &ascendc_elewise_ops::op_add_custom); +} +} // namespace ascendc_elewise_ops \ No newline at end of file diff --git a/atvc/examples/pytorch/add/run.sh b/atvc/examples/pytorch/add/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..2650c7dc98f5ea7b22a49b1a57a1c9d9b64a5d0c --- /dev/null +++ b/atvc/examples/pytorch/add/run.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +torch_location=$(python3 -c "import torch; print(torch.__path__[0])") +torch_npu_location=$(python3 -c "import torch_npu; print(torch_npu.__path__[0])") +python_include=$(python3 -c "import sysconfig; print(sysconfig.get_path('include'))") +python_lib=$(python3 -c "import sysconfig; print(sysconfig.get_path('stdlib'))") +export LD_LIBRARY_PATH=${torch_npu_location}/lib/:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=${torch_location}/lib/:$LD_LIBRARY_PATH +if [ -z "$ATVC_PATH" ]; then + atvc_path=$(realpath ../../../include) +else + atvc_path=$ATVC_PATH +fi + +rm -rf *.json +rm -rf libascendc_pytorch.so + + +bishengcc pytorch_ascendc_extension.cpp \ + -arch Ascend910B1 \ + -I${torch_location}/include \ + -I${torch_location}/include/torch/csrc/api/include \ + -I${python_include} \ + -I${atvc_path} \ + -I${torch_npu_location}/include \ + -L${torch_location}/lib \ + -L${torch_npu_location}/lib \ + -L${python_lib} \ + -ltorch -ltorch_cpu -lc10 -ltorch_npu -lpython3 -ltorch_python \ + -o libascendc_pytorch.so \ + -shared + +python3 run_op.py + +if [ $? -ne 0 ]; then + echo "ERROR: verify result failed! the result is wrong!" 
+    exit 1
+fi
+
+rm -rf *.json
+rm -rf libascendc_pytorch.so
\ No newline at end of file
diff --git a/atvc/examples/pytorch/add/run_op.py b/atvc/examples/pytorch/add/run_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..52ee702116e83f8f13ec22406c3090f1e17529ae
--- /dev/null
+++ b/atvc/examples/pytorch/add/run_op.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+#
+# This file is a part of the CANN Open Software.
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import torch
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+
+torch.npu.config.allow_internal_format = False
+torch.ops.load_library('./libascendc_pytorch.so')
+
+
+class TestAscendCOps(TestCase):
+
+    def test_add_custom_ops_float(self):
+        length = [8, 2048]
+        x = torch.rand(length, device='cpu', dtype=torch.float32)
+        y = torch.rand(length, device='cpu', dtype=torch.float32)
+        npuout = torch.ops.ascendc_ops.add(x.npu(), y.npu())
+        cpuout = torch.add(x, y)
+        self.assertRtolEqual(npuout, cpuout)
+
+    def test_add_custom_ops_int(self):
+        length = [8, 2048]
+        x = torch.randint(-10, 10, length, device='cpu', dtype=torch.int32)
+        y = torch.randint(-10, 10, length, device='cpu', dtype=torch.int32)
+        npuout = torch.ops.ascendc_ops.add(x.npu(), y.npu())
+        cpuout = torch.add(x, y)
+        self.assertRtolEqual(npuout, cpuout)
+
+if __name__ == '__main__':
+    run_tests()
\ No newline at end of file
diff --git a/atvc/examples/pytorch/reduce_sum/README.md b/atvc/examples/pytorch/reduce_sum/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e5115295f6cb02bfc8665dc82c9e0faa4172f104
--- /dev/null
+++ b/atvc/examples/pytorch/reduce_sum/README.md
@@ -0,0 +1,177 @@
+## 概述
+本样例基于ReduceSum算子,介绍了基于ATVC的单算子工程、单算子调用。
+
+## 目录结构介绍
+```
+├── reduce_sum
+│   ├── reduce_sum_impl.h                // 通过pytorch调用的方式调用ReduceSum算子
+│   ├── pytorch_ascendc_extension.cpp    // pytorch调用入口
+│   ├── run_op.py                        // pytorch的测试用例
+│   └── run.sh                           // 脚本,编译需要的二进制文件,并测试
+```
+
+## 算子描述
+ReduceSum算子实现了对输入数据沿指定维度累加求和的功能。对应的数学表达式为:
+```
+y = sum(x, dim)
+```
+## 算子规格描述
+
+| 算子类型(OpType) | ReduceSum | | | |
+| ---------------- | --------------- | -------- | --------- | ------ |
+| 算子输入 | name | shape | data type | format |
+| | x | 8 * 2048 | float,int | ND |
+| 算子输出 | y | 8 * 2048 | float,int | ND |
+| 核函数名 | ReduceSumCustom | | | |
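+
+上表中float、int两种数据类型各对应一份算子描述。下面是按规格表推测的OpTraits声明示意(仅为草图,模板实参以reduce_sum_impl.h中的实际定义为准):
+```cpp
+// 按规格表推测的算子描述声明(示意;以reduce_sum_impl.h为准)
+using ReduceOpTraitsFloat = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+using ReduceOpTraitsInt = ATVC::OpTraits<ATVC::OpInputs<int32_t>, ATVC::OpOutputs<int32_t>>;
+```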
+ +## 支持的产品型号 +本样例支持如下产品型号: +- Atlas A2训练系列产品 + +## 算子工程介绍 +其中,算子工程目录ReduceSum包含算子的实现文件,如下所示: +``` +├── reduce_sum +│ ├── reduce_sum_impl.h // 通过pytroch调用的方式调用ReduceSum算子 +│ ├── pytorch_ascendc_extension.cpp // pytorch调用入口 +│ ├── run_op.py // pytorch的测试用例 +│ └── run.sh // 脚本,编译需要的二进制文件,并测试 +``` + +run.sh脚本编译pytorch所需要的算子依赖,并将执行测试用例。 + +## 编译运行样例算子 +针对自定义算子工程,编译运行包含如下步骤: +- 完成算子pytorch入口和impl文件的实现; +- 编译pytorch算子的二进制文件; +- 调用执行pytorch算子; + +详细操作如下所示。 +### 1. 获取源码包及环境配置 +编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 +### 1. 安装pytorch环境 +参考[torch的安装](https://gitee.com/ascend/pytorch)进行安装torch、torch_npu环境 + +### 2. 基于ATVC编写pytorch算子的实现 + - 编写kernel侧函数,参考[reduce_sum_impl.h](./reduce_sum_impl.h) + ```cpp + // 引入头文件 + #include "reduce/reduce_host.h" + #include "reduce/reduce_device.h" + + // 首先通过ATVC提供的ATVC::OpTraits模板结构体来描述Add算子的输入输出信息,定义如下: + using ReduceOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; + using ReduceOpTraitsInt = ATVC::OpTraits, ATVC::OpOutputs>; + + /* + * 该函数为ReduceSumCustom算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第一个输出 + * reduceParam ATVC::ReduceParam + */ + template + __global__ __aicore__ void ReduceSumCustom(GM_ADDR x, GM_ADDR y, GM_ADDR z, ATVC::ReduceParam param) + { + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 + // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 + // ATVC::ReduceParam* 为tiling的类型 + auto op = ATVC::Kernel::ReduceOpTemplate, + Policy, + void, + void, + ATVC::ReduceParam*>(); + op.Run(x, y, &reduceParam); + } + ``` + - 编写pytorch入口函数,并调用kernel侧函数,参考[pytorch_ascendc_extension.cpp](./pytorch_ascendc_extension.cpp) + ```cpp + // pytorch 入口函数 + at::Tensor op_reduce_sum(const at::Tensor &x, const std::vector &dim) + { + std::vector shapeIn; + std::vector shapeOut; + ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + for (int32_t size : x.sizes()) { + shapeIn.push_back(size); + shapeOut.push_back(size); + } + for (const auto &i : dim) { + shapeOut[i] = 1; + } + auto options = torch::TensorOptions().dtype(x.scalar_type()).device(x.device()); + at::Tensor y = at::empty(shapeOut, options); + if (x.scalar_type() == at::kFloat) { + // Host侧调用Tiling API完成相关运行态参数的运算 + (void)ATVC::Host::CalcReduceTiling(shapeIn, dim, &policy, ¶m); + // 调用Adapter调度接口,完成核函数的模板调用 + AscendC::ReduceOpAdapter( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), param, policy); + } else if (x.scalar_type() == at::kInt) { + // int 类型调用 + ... + } + return y; + } + // 调用 + namespace AscendC { + inline namespace reduce { + template + // 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 + void ReduceOpAdapter(uint8_t *x, uint8_t *y, ATVC::ReduceParam param, ATVC::ReducePolicy &policy) + { + auto stream = c10_npu::getCurrentNPUStream().stream(false); + + // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 + if (policy == ATVC::REDUCE_POLICY0) { + ReduceSumCustom<<>>(x, y, param); + }else if { //其他policy逻辑 + ... 
+ } + } + } // namespace reduce + } // namespace AscendC + ``` + - 编写python调用函数,并调用pytorch入口函数,参考[run_op.py](./run_op.py) + + ```python + # 引入头文件 + import torch + import torch_npu + import numpy as np + from torch_npu.testing.testcase import TestCase, run_tests + # 加载二进制 + torch.npu.config.allow_internal_format = False + torch.ops.load_library('./libascendc_pytorch.so') + + class TestAscendCOps(TestCase): + # 测试用例 + def test_reduce_sum_ops_float(self): + length = [8, 2048] + x = torch.rand(length, device='cpu', dtype=torch.float32) + npuout = torch.ops.ascendc_ops.sum(x.npu(), (0,)) + cpuout = torch.sum(x, (0,)) + self.assertRtolEqual(npuout.reshape(cpuout.shape), cpuout) + + # 测试用例调用 + if __name__ == '__main__': + run_tests() + ``` + +### 3. 基于ATVC编写pytorch算子的调用验证 + - 调用脚本,生成pytorch算子,并运行测试用例 + ```bash + $ cd ./atvc/examples/pytorch/reduce_sum + $ bash run.sh + ... + OK + ``` + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ---------------------------- | +| 2025/07/24 | 新增readme | diff --git a/atvc/examples/pytorch/reduce_sum/pytorch_ascendc_extension.cpp b/atvc/examples/pytorch/reduce_sum/pytorch_ascendc_extension.cpp new file mode 100644 index 0000000000000000000000000000000000000000..86c275e47b77584abefa7ba62085dc3b472ab165 --- /dev/null +++ b/atvc/examples/pytorch/reduce_sum/pytorch_ascendc_extension.cpp @@ -0,0 +1,112 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include +#include "torch_npu/csrc/core/npu/NPUStream.h" +#include "reduce_sum_impl.h" + +namespace AscendC { +inline namespace reduce { +// 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 +template +void ReduceOpAdapter(uint8_t *x, uint8_t *y, ATVC::ReduceParam param, ATVC::ReducePolicy &policy) +{ + auto stream = c10_npu::getCurrentNPUStream().stream(false); + + // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 + if (policy == ATVC::REDUCE_POLICY0) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY1) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY2) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY3) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY4) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY5) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY6) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY7) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY8) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY9) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY10) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY11) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY12) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY13) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY14) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY15) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY16) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY17) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY18) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY19) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY20) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY21) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY22) { + ReduceSumCustom<<>>(x, y, param); + } +} +} // namespace reduce +} // namespace AscendC + +namespace ascendc_reduce_ops { +at::Tensor op_reduce_sum(const at::Tensor &x, const std::vector &dim) +{ + std::vector shapeIn; + std::vector shapeOut; + ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + for (int32_t size : x.sizes()) { + shapeIn.push_back(size); + shapeOut.push_back(size); + } + for (const auto &i : dim) { + shapeOut[i] = 1; + } + auto options = torch::TensorOptions().dtype(x.scalar_type()).device(x.device()); + at::Tensor y = at::empty(shapeOut, options); + if (x.scalar_type() == at::kFloat) { + // Host侧调用Tiling API完成相关运行态参数的运算 + (void)ATVC::Host::CalcReduceTiling(shapeIn, dim, &policy, ¶m); + // 调用Adapter调度接口,完成核函数的模板调用 + AscendC::ReduceOpAdapter( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), param, policy); + } else if (x.scalar_type() == at::kInt) { + // Host侧调用Tiling API完成相关运行态参数的运算 + (void)ATVC::Host::CalcReduceTiling(shapeIn, dim, &policy, ¶m); + // 调用Adapter调度接口,完成核函数的模板调用 + AscendC::ReduceOpAdapter( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), param, policy); + } + return y; +} + 
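+// 通过TORCH_LIBRARY将op_reduce_sum注册为torch自定义算子,python侧经torch.ops.ascendc_ops.sum调用(见run_op.py)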
+TORCH_LIBRARY(ascendc_ops, m) +{ + m.def("sum", &ascendc_reduce_ops::op_reduce_sum); +} +} // namespace ascendc_reduce_ops \ No newline at end of file diff --git a/atvc/examples/pytorch/reduce_sum/reduce_sum_impl.h b/atvc/examples/pytorch/reduce_sum/reduce_sum_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..670215c7ba865e50c966ae76ee1a26364138d0a4 --- /dev/null +++ b/atvc/examples/pytorch/reduce_sum/reduce_sum_impl.h @@ -0,0 +1,39 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef REDUCE_SUM_CUSTOM_IMPL_H +#define REDUCE_SUM_CUSTOM_IMPL_H +#include "reduce/reduce_host.h" +#include "reduce/reduce_device.h" + +using ReduceOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; +using ReduceOpTraitsInt = ATVC::OpTraits, ATVC::OpOutputs>; + +/* + * 该函数为ReduceSumCustom算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第一个输出 + * reduceParam ATVC::ReduceParam + */ +template +__global__ __aicore__ void ReduceSumCustom(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam reduceParam) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 + // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 + auto op = ATVC::Kernel::ReduceOpTemplate, + Policy, + void, + void, + ATVC::ReduceParam*>(); + op.Run(x, y, &reduceParam); +} + +#endif diff --git a/atvc/examples/pytorch/reduce_sum/run.sh b/atvc/examples/pytorch/reduce_sum/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..2650c7dc98f5ea7b22a49b1a57a1c9d9b64a5d0c --- /dev/null +++ b/atvc/examples/pytorch/reduce_sum/run.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +torch_location=$(python3 -c "import torch; print(torch.__path__[0])") +torch_npu_location=$(python3 -c "import torch_npu; print(torch_npu.__path__[0])") +python_include=$(python3 -c "import sysconfig; print(sysconfig.get_path('include'))") +python_lib=$(python3 -c "import sysconfig; print(sysconfig.get_path('stdlib'))") +export LD_LIBRARY_PATH=${torch_npu_location}/lib/:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=${torch_location}/lib/:$LD_LIBRARY_PATH +if [ -z "$ATVC_PATH" ]; then + atvc_path=$(realpath ../../../include) +else + atvc_path=$ATVC_PATH +fi + +rm -rf *.json +rm -rf libascendc_pytorch.so + + +bishengcc pytorch_ascendc_extension.cpp \ + -arch Ascend910B1 \ + -I${torch_location}/include \ + -I${torch_location}/include/torch/csrc/api/include \ + -I${python_include} \ + -I${atvc_path} \ + -I${torch_npu_location}/include \ + -L${torch_location}/lib \ + -L${torch_npu_location}/lib \ + -L${python_lib} \ + -ltorch -ltorch_cpu -lc10 -ltorch_npu -lpython3 -ltorch_python \ + -o libascendc_pytorch.so \ + -shared + +python3 run_op.py + +if [ $? -ne 0 ]; then + echo "ERROR: verify result failed! the result is wrong!" + return 1 +fi + +rm -rf *.json +rm -rf libascendc_pytorch.so \ No newline at end of file diff --git a/atvc/examples/pytorch/reduce_sum/run_op.py b/atvc/examples/pytorch/reduce_sum/run_op.py new file mode 100644 index 0000000000000000000000000000000000000000..c433a3abeb8786f913116d6d64c3dc91755a8ff4 --- /dev/null +++ b/atvc/examples/pytorch/reduce_sum/run_op.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. +# +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
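+# 本用例加载run.sh编译生成的libascendc_pytorch.so,将torch.ops.ascendc_ops.sum与torch.sum的计算结果进行比对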
+ +import torch +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests + +torch.npu.config.allow_internal_format = False +torch.ops.load_library('./libascendc_pytorch.so') + + +class TestAscendCOps(TestCase): + + def test_reduce_sum_ops_float(self): + length = [8, 2048] + x = torch.rand(length, device='cpu', dtype=torch.float32) + npuout = torch.ops.ascendc_ops.sum(x.npu(), (0,)) + cpuout = torch.sum(x, (0,)) + self.assertRtolEqual(npuout.reshape(cpuout.shape), cpuout) + + def test_reduce_sum_ops_int(self): + length = [8, 2048] + x = torch.randint(-10, 10, length, device='cpu', dtype=torch.int32) + npuout = torch.ops.ascendc_ops.sum(x.npu(), (0,)) + cpuout = torch.sum(x, (0,), dtype=torch.int32) + self.assertRtolEqual(npuout.reshape(cpuout.shape), cpuout) + +if __name__ == '__main__': + run_tests() \ No newline at end of file diff --git a/atvc/examples/reduce_sum/reduce_sum.cpp b/atvc/examples/reduce_sum/reduce_sum.cpp index 6f19ab4964668cc9cd63744c46c2541b469fe20c..43b393022690ea2be7ec24ec5361963bc569c8b3 100644 --- a/atvc/examples/reduce_sum/reduce_sum.cpp +++ b/atvc/examples/reduce_sum/reduce_sum.cpp @@ -16,7 +16,8 @@ #include #include #include "acl/acl.h" -#include "atvc.h" +#include "reduce/reduce_host.h" +#include "reduce/reduce_device.h" #define CHECK_ACL(x) \ do { \ @@ -37,11 +38,26 @@ bool IsClose(float a, float b) float diff = std::abs(a - b); return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); } -} // ReduceSum算子的描述:一个输入,一个输出,类型均为float using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; +bool VerifyResults(const std::vector &golden, const std::vector &output) +{ + for (int32_t i = 0; i < golden.size(); i++) { + if (!IsClose(golden[i], output[i])) { + printf("Accuracy verification failed! The expected value of element " + "in index [%d] is %f, but actual value is %f.\n", + i, + golden[i], + output[i]); + return false; + } + } + return true; +} +} + /* * 该函数为ReduceCustom算子核函数入口 * x Device上的gm地址,指向Add算子第一个输入 @@ -155,7 +171,7 @@ int32_t main(int32_t argc, char* argv[]) CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 - ATVC::ReducePolicy policy; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 // Host侧调用Tiling API完成相关运行态参数的运算 if (!ATVC::Host::CalcReduceTiling(shape, dim, &policy, ¶m)) { printf("Reduce tiling error.\n"); @@ -178,14 +194,10 @@ int32_t main(int32_t argc, char* argv[]) CHECK_ACL(aclrtResetDevice(deviceId)); CHECK_ACL(aclFinalize()); - for (int32_t i = 0; i < outEleNum; i++) { - if (!IsClose(golden[i], outputY[i])) { - printf("Accuracy verification failed! 
The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, golden[i], outputY[i]); - return -1; - } + if (!VerifyResults(golden, outputY)) { + return -1; } + printf("Accuracy verification passed.\n"); return 0; } diff --git a/atvc/examples/sinh_custom/sinh_custom.cpp b/atvc/examples/sinh_custom/sinh_custom.cpp index b339ccbbcc8e0d741ea84d195274fc8918521277..e8e277b76e7117137579b4ab6880453d182c1e7d 100644 --- a/atvc/examples/sinh_custom/sinh_custom.cpp +++ b/atvc/examples/sinh_custom/sinh_custom.cpp @@ -16,7 +16,8 @@ #include #include #include "acl/acl.h" -#include "atvc.h" +#include "elewise/elewise_host.h" +#include "elewise/elewise_device.h" #define CHECK_ACL(x) \ do { \ @@ -37,7 +38,6 @@ bool IsClose(float a, float b) float diff = std::abs(a - b); return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); } -} // 描述算子的输入输出以及临时计算资源 using SinhOpTraits = ATVC::OpTraits, ATVC::OpOutputs, ATVC::OpTemps>; @@ -64,6 +64,48 @@ struct SinhComputeFunc { } }; +void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &golden) +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(1.0f, 10.0f); + + for (int i = 0; i < eleNum; ++i) { + inputX[i] = dis(gen); + golden[i] = std::sinh(inputX[i]); + } +} + +bool VerifyResults(const std::vector &golden, const std::vector &output) +{ + for (int32_t i = 0; i < golden.size(); i++) { + if (!IsClose(golden[i], output[i])) { + printf("Accuracy verification failed! The expected value of element " + "in index [%d] is %f, but actual value is %f.\n", + i, + golden[i], + output[i]); + return false; + } + } + return true; +} + +void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void CleanACL(aclrtStream &stream, int32_t deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} +} /* * 该函数为SinhCustom算子核函数入口 @@ -85,19 +127,11 @@ int main() int32_t eleNum = 8 * 2048; size_t inputByteSize = static_cast(eleNum) * sizeof(float); size_t outputByteSize = static_cast(eleNum) * sizeof(float); - - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dis(1.0f, 10.0f); std::vector inputX(eleNum); std::vector golden(eleNum); + InitializeData(eleNum, inputX, golden); - for (int i = 0; i < eleNum; ++i) { - inputX[i] = (dis(gen)); - golden[i] = std::sinh(inputX[i]); - } - printf("Generate golden data successfully.\n"); ATVC::EleWiseParam param; // 计算输入为8*2048个float元素的sinh算子的运行态参数param @@ -106,19 +140,15 @@ int main() return -1; }; // 初始化Acl资源与数据 - CHECK_ACL(aclInit(nullptr)); aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); + int32_t deviceId = 0; + InitializeACL(context, stream, deviceId); uint8_t *yHost; uint8_t *xDevice; uint8_t *yDevice; uint8_t *paramDevice; - CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); @@ -145,17 +175,10 @@ int main() CHECK_ACL(aclrtFree(paramDevice)); CHECK_ACL(aclrtFreeHost(yHost)); - CHECK_ACL(aclrtDestroyStream(stream)); - 
CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); + CleanACL(stream, deviceId); - for (int32_t i = 0; i < eleNum; i++) { - if (!IsClose(golden[i], outputY[i])) { - printf("Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, golden[i], outputY[i]); - return -1; - } + if (!VerifyResults(golden, outputY)) { + return -1; } printf("Accuracy verification passed.\n"); return 0; diff --git a/atvc/include/broadcast/broadcast_device.h b/atvc/include/broadcast/broadcast_device.h new file mode 100644 index 0000000000000000000000000000000000000000..7d1f7775f4b696098b159f7321c4f6fa3fd4cede --- /dev/null +++ b/atvc/include/broadcast/broadcast_device.h @@ -0,0 +1,30 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + + +/*! + * \file broadcast_device.h + * \brief + */ + +#ifndef ATVC_BROADCAST_DEVICE_H +#define ATVC_BROADCAST_DEVICE_H + +#include "common/atvc_opdef.h" +#include "common/const_def.h" +#include "common/kernel_utils.h" +#include "common/ops_utils_device.h" +#include "broadcast/common/broadcast_common.h" +#include "broadcast/broadcast_compute.h" +#include "broadcast/broadcast_op_template.h" +#include "kernel_operator.h" + +#endif // ATVC_BROADCAST_DEVICE_H \ No newline at end of file diff --git a/atvc/include/broadcast/broadcast_op_template.h b/atvc/include/broadcast/broadcast_op_template.h index f9d777f5199fd1677c95affa8db46631af470d38..99cdc7d9834829374883145bc66b71be552f6e2b 100644 --- a/atvc/include/broadcast/broadcast_op_template.h +++ b/atvc/include/broadcast/broadcast_op_template.h @@ -13,7 +13,9 @@ #ifndef ATVC_BROADCAST_OP_TEMPLATE_H #define ATVC_BROADCAST_OP_TEMPLATE_H #include "common/const_def.h" +#include "common/ops_utils_device.h" #include "broadcast/broadcast_utils/broadcast_buf_pool.h" +#include "broadcast/broadcast_compute.h" namespace ATVC { struct BroadcastDataView { uint32_t dimASize; @@ -30,7 +32,7 @@ struct BroadcastDataView { }; namespace Kernel { -template +template class BroadcastOpTemplate { public: using DataType = typename BroadcastCompute::DataType; @@ -41,26 +43,27 @@ public: @param dst: 输出数据的gm指针 @broadcastParam: broadcast的动态参数,包含tiling data, workspace等 */ - __aicore__ inline void Run(GM_ADDR src, GM_ADDR dst, GM_ADDR broadcastParam) + template + __aicore__ inline void Run(GM_ADDR src, GM_ADDR dst, T1 broadcastParam) { this->Init(src, dst, broadcastParam); this->Process(); } private: - __aicore__ inline void Init(GM_ADDR src, GM_ADDR dst, GM_ADDR broadcastParam) + template + __aicore__ inline void Init(GM_ADDR src, GM_ADDR dst, T1 broadcastParam) { - param_ = reinterpret_cast<__gm__ BroadcastParam*>(broadcastParam); - tilingData_ = ¶m_->tilingData; - uint32_t srcDataSize = tilingData_->basicBlock; - uint32_t dstDataSize = tilingData_->basicBlock; + param_ = reinterpret_cast(broadcastParam); + uint32_t srcDataSize = this->param_->tilingData.basicBlock; + uint32_t dstDataSize = 
this->param_->tilingData.basicBlock; srcGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ DataType*>(src), srcDataSize); dstGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ DataType*>(dst), dstDataSize); bufPool_.template Init(GetTPipePtr(), ATVC::CONST2, // doublebuff需要的输入个数 ATVC::CONST2, // 计算结果的个数,一般与inputNum保持一致 - tilingData_->A2 * tilingData_->A12 * DATA_SIZE, // 输入Tensor大小 - tilingData_->A2 * tilingData_->B2 * DATA_SIZE); // 输出Tensor大小 + this->param_->tilingData.A2 * this->param_->tilingData.A12 * DATA_SIZE, // 输入Tensor大小 + this->param_->tilingData.A2 * this->param_->tilingData.B2 * DATA_SIZE); // 输出Tensor大小 } __aicore__ inline void CopyOutBatch(BroadcastDataView &view, @@ -72,12 +75,12 @@ private: for (int i = 0; i < view.B1; i++) { uint32_t copyOutOffset; if (SelectBroadcastPolicy.patternID == AB_PATTERN::ABA) { - copyOutOffset = dimBCount * view.dimASize + dimACount * tilingData_->A2; + copyOutOffset = dimBCount * view.dimASize + dimACount * this->param_->tilingData.A2; } else { - copyOutOffset = dimACount * tilingData_->A2 * view.dimBSize + dimBCount; + copyOutOffset = dimACount * this->param_->tilingData.A2 * view.dimBSize + dimBCount; } CopyOut(output, copyOutOffset + view.copyOutBaseOffset, view); - dimBCount += tilingData_->B2; + dimBCount += this->param_->tilingData.B2; AscendC::PipeBarrier(); } } @@ -92,8 +95,8 @@ private: for (int i = 0; i < view.A11; i++) { inputOffset = 0; bufPool_.AllocTensor(input); - uint32_t copyInOffset = i * view.A12 * tilingData_->A2; - if (tilingData_->A0 != 1) { + uint32_t copyInOffset = i * view.A12 * this->param_->tilingData.A2; + if (this->param_->tilingData.A0 != 1) { copyInOffset += view.dimAOffset; } if (copyInOffset >= view.dimASize) { @@ -102,7 +105,7 @@ private: if (copyInOffset + view.copyInSize > view.dimASize) { // 剩下的数据不够一次完整计算, 根据实际数据重新计算 view.copyInSize = view.dimASize - copyInOffset; - view.A12 = OpsUtils::CeilDiv(view.copyInSize, tilingData_->A2); + view.A12 = OpsUtils::CeilDiv(view.copyInSize, this->param_->tilingData.A2); } CopyIn(input, copyInOffset, view); bufPool_.SetVecSync(input); @@ -112,14 +115,14 @@ private: bufPool_.AllocTensor(output); SyncDataQueue(); compute_.template Compute(input, inputOffset, output, - OpsUtils::CeilAlign(tilingData_->A2, UB_ALIGN_COUNT), - OpsUtils::CeilAlign(tilingData_->B2, UB_ALIGN_COUNT)); + OpsUtils::CeilAlign(this->param_->tilingData.A2, UB_ALIGN_COUNT), + OpsUtils::CeilAlign(this->param_->tilingData.B2, UB_ALIGN_COUNT)); bufPool_.SetCopyOutSync(output); bufPool_.WaitCopyOutSync(output); CopyOutBatch(view, dimACount, output); bufPool_.FreeTensor(output); dimACount++; - inputOffset += tilingData_->A2; + inputOffset += this->param_->tilingData.A2; SyncDataQueue(); } SyncDataQueue(); @@ -133,17 +136,17 @@ private: uint32_t copyOutBaseOffset = 0; // 计算拷出偏移基址 if (SelectBroadcastPolicy.patternID == AB_PATTERN::ABA) { - if (tilingData_->A0 != 1) { // 核间A切分, 取部分A + if (this->param_->tilingData.A0 != 1) { // 核间A切分, 取部分A copyOutBaseOffset += view.dimAOffset; } - if (tilingData_->B0 != 1) { // 核间B切分,取部分B + if (this->param_->tilingData.B0 != 1) { // 核间B切分,取部分B copyOutBaseOffset += view.dimBOffset * view.dimASize; } } else { - if (tilingData_->A0 != 1) { // 核间A切分, 取部分A + if (this->param_->tilingData.A0 != 1) { // 核间A切分, 取部分A copyOutBaseOffset += view.dimAOffset * view.dimBSize; } - if (tilingData_->B0 != 1) { // 核间B切分,取部分B + if (this->param_->tilingData.B0 != 1) { // 核间B切分,取部分B copyOutBaseOffset += view.dimBOffset; } } @@ -153,34 +156,34 @@ private: __aicore__ inline void CalcView(BroadcastDataView 
&view) { if (SelectBroadcastPolicy.patternID == AB_PATTERN::ABA) { - view.dimASize = tilingData_->dstShape[1]; - view.dimBSize = tilingData_->dstShape[0]; + view.dimASize = this->param_->tilingData.dstShape[1]; + view.dimBSize = this->param_->tilingData.dstShape[0]; view.inShape[0] = 1; - view.inShape[1] = tilingData_->A2; - view.outShape[0] = tilingData_->B2; - view.outShape[1] = tilingData_->A2; + view.inShape[1] = this->param_->tilingData.A2; + view.outShape[0] = this->param_->tilingData.B2; + view.outShape[1] = this->param_->tilingData.A2; } else { - view.dimASize = tilingData_->dstShape[0]; - view.dimBSize = tilingData_->dstShape[1]; - view.inShape[0] = tilingData_->A2; + view.dimASize = this->param_->tilingData.dstShape[0]; + view.dimBSize = this->param_->tilingData.dstShape[1]; + view.inShape[0] = this->param_->tilingData.A2; view.inShape[1] = 1; - view.outShape[0] = tilingData_->A2; - view.outShape[1] = tilingData_->B2; + view.outShape[0] = this->param_->tilingData.A2; + view.outShape[1] = this->param_->tilingData.B2; } - view.A11 = tilingData_->A11; - view.A12 = tilingData_->A12; - view.B1 = tilingData_->B1; + view.A11 = this->param_->tilingData.A11; + view.A12 = this->param_->tilingData.A12; + view.B1 = this->param_->tilingData.B1; uint32_t blockId = AscendC::GetBlockIdx(); - uint32_t dimAIdx = blockId / tilingData_->B0; - uint32_t dimBIdx = blockId % tilingData_->factorBTotalCnt; - view.dimAOffset = dimAIdx * tilingData_->factorACntPerCore; - view.dimBOffset = dimBIdx * tilingData_->factorBCntPerCore; + uint32_t dimAIdx = blockId / this->param_->tilingData.B0; + uint32_t dimBIdx = blockId % this->param_->tilingData.factorBTotalCnt; + view.dimAOffset = dimAIdx * this->param_->tilingData.factorACntPerCore; + view.dimBOffset = dimBIdx * this->param_->tilingData.factorBCntPerCore; // 计算一次计算的输入数据大小 - view.copyInSize = view.A12 * tilingData_->A2; // 一次拷贝A12份数据, for循环计算A12次 - if (view.dimAOffset + tilingData_->factorACntPerCore > view.dimASize) { + view.copyInSize = view.A12 * this->param_->tilingData.A2; // 一次拷贝A12份数据, for循环计算A12次 + if (view.dimAOffset + this->param_->tilingData.factorACntPerCore > view.dimASize) { // 剩下的A维度的数据不够每个核分到的A数目,重新计算实际的A维度切分 uint32_t realShape = view.dimASize - view.dimAOffset; - uint32_t A1 = OpsUtils::CeilDiv(realShape, tilingData_->A2); + uint32_t A1 = OpsUtils::CeilDiv(realShape, this->param_->tilingData.A2); if (A1 < view.A12) { view.A11 = 1; view.A12 = A1; @@ -188,9 +191,9 @@ private: view.A11 = OpsUtils::CeilDiv(A1, view.A12); } } - if (view.dimBOffset + tilingData_->factorBCntPerCore > view.dimBSize) { + if (view.dimBOffset + this->param_->tilingData.factorBCntPerCore > view.dimBSize) { uint32_t realShape = view.dimBSize - view.dimBOffset; - view.B1 = OpsUtils::CeilDiv(realShape, tilingData_->B2); + view.B1 = OpsUtils::CeilDiv(realShape, this->param_->tilingData.B2); } view.copyOutBaseOffset = CalcCopyOutBaseOffset(view); } @@ -211,7 +214,7 @@ private: { uint32_t blockId = AscendC::GetBlockIdx(); uint32_t dstDataSize = view.outShape[0] * view.outShape[1]; - uint64_t dstShape = tilingData_->dstShape[1]; + uint64_t dstShape = this->param_->tilingData.dstShape[1]; AscendC::DataCopyExtParams copyOutParams; copyOutParams.blockLen = view.outShape[1] * DATA_SIZE; copyOutParams.blockCount = dstDataSize * DATA_SIZE / copyOutParams.blockLen; @@ -222,9 +225,9 @@ private: copyOutParams.blockLen = (dstShape - copyOutOffset % dstShape) * DATA_SIZE; copyOutParams.srcStride = (copyOutParams.srcStride - copyOutParams.blockLen) / ATVC::UB_ALIGN_32; } - if 
(view.outShape[0] + copyOutOffset / dstShape > tilingData_->dstShape[0]) { + if (view.outShape[0] + copyOutOffset / dstShape > this->param_->tilingData.dstShape[0]) { // 行非对齐, 按实际数据拷贝 - copyOutParams.blockCount = (tilingData_->dstShape[0] - copyOutOffset / dstShape); + copyOutParams.blockCount = (this->param_->tilingData.dstShape[0] - copyOutOffset / dstShape); } copyOutParams.dstStride = dstShape * DATA_SIZE - copyOutParams.blockLen; AscendC::DataCopyPad(dstGlobal_[copyOutOffset], output, copyOutParams); @@ -238,8 +241,7 @@ private: AscendC::GlobalTensor srcGlobal_; AscendC::GlobalTensor dstGlobal_; BroadcastCompute compute_; - const __gm__ BroadcastParam *param_; - const __gm__ BroadcastOpTilingData *tilingData_; + ParamPtr param_; KernelUtils::BroadcastBufPool bufPool_; constexpr static uint32_t DATA_SIZE = sizeof(DataType); constexpr static uint32_t UB_ALIGN_COUNT = ATVC::UB_ALIGN_32 / DATA_SIZE; diff --git a/atvc/include/broadcast/common/broadcast_common.h b/atvc/include/broadcast/common/broadcast_common.h index 3f51b4e808a3188cedd287fc36df9d6a9b8e3d9e..fcdd0c0ff194596374ea5f58ba196f364fba1105 100644 --- a/atvc/include/broadcast/common/broadcast_common.h +++ b/atvc/include/broadcast/common/broadcast_common.h @@ -32,6 +32,8 @@ public: int32_t patternID = -1; int32_t loopABCount = -1; int32_t loopInnerABCount = -1; + constexpr BroadcastPolicy(int patternID_, int loopABCount_, int loopInnerABCount_): + patternID(patternID_), loopABCount(loopABCount_),loopInnerABCount(loopInnerABCount_){} bool operator==(const BroadcastPolicy& rhs) const { return this->patternID == rhs.patternID && this->loopABCount == rhs.loopABCount &&\ diff --git a/atvc/include/broadcast/tiling/broadcast_tiling.h b/atvc/include/broadcast/tiling/broadcast_tiling.h index a054aaa3265a0b2958bde14c0f81ee5b3d138530..10dfc8fe7f8b333923e02407904fc145e099bc91 100644 --- a/atvc/include/broadcast/tiling/broadcast_tiling.h +++ b/atvc/include/broadcast/tiling/broadcast_tiling.h @@ -24,7 +24,7 @@ #include "graph/types.h" #include "common/const_def.h" #include "common/compile_info.h" -#include "common/ops_utils.h" +#include "common/ops_utils_host.h" namespace ATVC { @@ -32,6 +32,8 @@ struct BroadcastTilingInputParam { std::vector shapeIn; std::vector shapeOut; ge::DataType inputDtype = ge::DataType::DT_UNDEFINED; + BroadcastTilingInputParam(std::vector in,std::vector out,ge::DataType inDtype): + shapeIn(in),shapeOut(out),inputDtype(inDtype){} }; } @@ -150,7 +152,7 @@ private: if (oriShapeIn[i] == 1 && oriShapeOut[i] != oriShapeIn[i]) { // B轴 if (!isCurB && haveB) { printf("[ERROR]Only support AB/BA!\n"); - return false; + return false; } if (!haveB) { shapeIn.emplace_back(oriShapeIn[i]); @@ -161,10 +163,10 @@ private: } isCurB = true; haveB = true; - }else { // A轴 + } else { // A轴 if (isCurB && haveA) { printf("[ERROR]Only support AB/BA!\n"); - return false; + return false; } if (!haveA) { shapeIn.emplace_back(oriShapeIn[i]); @@ -351,7 +353,7 @@ private: ATVC::BroadcastTilingInputParam opInput_; ATVC::BroadcastParam* param_ {nullptr}; ATVC::BroadcastPolicy* policy_ {nullptr}; - ATVC::OpCompileInfo compileInfo_; + ATVC::OpCompileInfo compileInfo_ = {0, 0, 0, 0}; }; } // namespace OpTiling #endif // ATVC_BROADCAST_TILING_H \ No newline at end of file diff --git a/atvc/include/common/compile_info.h b/atvc/include/common/compile_info.h index dc5479a76fd92c8f306e8c95cd1230d770d48ff8..f92f68aa479d8b89ed24cbc78bc0f026c8cba9a7 100644 --- a/atvc/include/common/compile_info.h +++ b/atvc/include/common/compile_info.h @@ -20,6 +20,12 @@ struct 
OpCompileInfo {
     uint64_t ubSize = 0;
     uint64_t cacheLineSize = 0;
     uint64_t ubBlockSize = 0;
+    OpCompileInfo(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+        this->vectorCoreNum = a;
+        this->ubBlockSize = d;
+        this->ubSize = b;
+        this->cacheLineSize = c;
+    }
 };
 
 inline OpCompileInfo GetOpCompileInfo()
diff --git a/atvc/include/common/const_def.h b/atvc/include/common/const_def.h
index 4ae60fa0d435c34e82d8891cf6798056e9b6a130..df6eb2fbd1bfbc9cdda64347dd6f649f784bcee8 100644
--- a/atvc/include/common/const_def.h
+++ b/atvc/include/common/const_def.h
@@ -13,6 +13,7 @@
 #ifndef ATVC_COMMON_CONST_DEF_H
 #define ATVC_COMMON_CONST_DEF_H
+#include <cstdint>
 namespace ATVC {
 constexpr int32_t UB_ALIGN_32 = 32;
 constexpr int32_t UB_ALIGN_31 = 31;
diff --git a/atvc/include/common/ops_utils_device.h b/atvc/include/common/ops_utils_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..cacac400336b39340c30b91fca507f19c282500f
--- /dev/null
+++ b/atvc/include/common/ops_utils_device.h
@@ -0,0 +1,77 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file ops_utils_device.h
+ * \brief
+ */
+
+#ifndef ATVC_COMMON_OPS_UTILS_DEVICE_H
+#define ATVC_COMMON_OPS_UTILS_DEVICE_H
+
+#include "kernel_operator.h"
+namespace OpsUtils {
+template <typename T>
+__aicore__ inline T Ceil(T a, T b)
+{
+    if (b == 0) {
+        return a;
+    }
+    return (a + b - 1) / b;
+}
+
+template <typename T>
+__aicore__ inline T CeilAlign(T a, T b)
+{
+    if (b == 0) {
+        return a;
+    }
+    return (a + b - 1) / b * b;
+}
+
+template <typename T>
+__aicore__ inline T CeilDiv(T a, T b)
+{
+    if (b == 0) {
+        return a;
+    }
+    return (a + b - 1) / b;
+}
+
+template <typename T, typename U>
+__aicore__ inline T FloorDiv(T a, U b)
+{
+    if (b == 0) {
+        return a;
+    }
+    return a / b;
+}
+
+template <typename T>
+__aicore__ inline T Aligned(T value, T alignment)
+{
+    if (alignment == 0) {
+        return value;
+    }
+    return (value + alignment - 1) / alignment * alignment;
+}
+
+/**
+ * if align is 0, return 0
+ */
+template <typename T, typename U>
+__aicore__ inline typename std::enable_if ::value, T>::type FloorAlign(T x, U align) {
+    return align == 0 ? 0 : x / align * align;
+}
+
+}
+
+#endif // ATVC_COMMON_OPS_UTILS_DEVICE_H
\ No newline at end of file
diff --git a/atvc/include/common/ops_utils.h b/atvc/include/common/ops_utils_host.h
similarity index 71%
rename from atvc/include/common/ops_utils.h
rename to atvc/include/common/ops_utils_host.h
index efb33063196c3d9d098a42b8abd82e62dc103b9a..e92ff007f5f3a5b3c6d845df9a46f5c3b11885c3 100644
--- a/atvc/include/common/ops_utils.h
+++ b/atvc/include/common/ops_utils_host.h
@@ -10,29 +10,34 @@
  */
 
 /*!
- * \file ops_utils.h
+ * \file ops_utils_host.h
  * \brief
  */
 
-#ifndef ATVC_COMMON_OPS_UTILS_H
-#define ATVC_COMMON_OPS_UTILS_H
+#ifndef ATVC_COMMON_OPS_UTILS_HOST_H
+#define ATVC_COMMON_OPS_UTILS_HOST_H
 
-#include "kernel_operator.h"
 namespace OpsUtils {
 template <typename T>
-__host_aicore__ inline T Ceil(T a, T b)
+inline T Ceil(T a, T b)
 {
+    if (b == 0) {
+        return a;
+    }
     return (a + b - 1) / b;
 }
 
 template <typename T>
-__host_aicore__ inline T CeilAlign(T a, T b)
+inline T CeilAlign(T a, T b)
 {
+    if (b == 0) {
+        return a;
+    }
     return (a + b - 1) / b * b;
 }
 
 template <typename T>
-__host_aicore__ inline T CeilDiv(T a, T b)
+inline T CeilDiv(T a, T b)
 {
     if (b == 0) {
         return a;
@@ -41,7 +46,7 @@ __host_aicore__ inline T CeilDiv(T a, T b)
 }
 
 template <typename T, typename U>
-__host_aicore__ inline T FloorDiv(T a, U b)
+inline T FloorDiv(T a, U b)
 {
     if (b == 0) {
         return a;
@@ -50,7 +55,7 @@ __host_aicore__ inline T FloorDiv(T a, U b)
 }
 
 template <typename T>
-__host_aicore__ inline T Aligned(T value, T alignment)
+inline T Aligned(T value, T alignment)
 {
     if (alignment == 0) {
         return value;
@@ -62,10 +67,10 @@ __host_aicore__ inline T Aligned(T value, T alignment)
  * if align is 0, return 0
  */
 template <typename T, typename U>
-__host_aicore__ inline typename std::enable_if ::value, T>::type FloorAlign(T x, U align) {
+inline typename std::enable_if ::value, T>::type FloorAlign(T x, U align) {
    return align == 0 ? 0 : x / align * align;
 }
 
 }
 
-#endif // ATVC_COMMON_OPS_UTILS_H
\ No newline at end of file
+#endif // ATVC_COMMON_OPS_UTILS_HOST_H
\ No newline at end of file
diff --git a/atvc/include/elewise/elewise_device.h b/atvc/include/elewise/elewise_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..de15983a34b1e3dcb4d4bdd9044c11a4fd45c6a7
--- /dev/null
+++ b/atvc/include/elewise/elewise_device.h
@@ -0,0 +1,28 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+
+/*!
+ * \file elewise_device.h + * \brief + */ + +#ifndef ATVC_ELEWISE_DEVICE_H +#define ATVC_ELEWISE_DEVICE_H + +#include "common/atvc_opdef.h" +#include "common/const_def.h" +#include "common/kernel_utils.h" +#include "elewise/common/elewise_common.h" +#include "elewise/elewise_op_template.h" +#include "kernel_operator.h" + +#endif // ATVC_ELEWISE_DEVICE_H \ No newline at end of file diff --git a/atvc/include/elewise/elewise_host.h b/atvc/include/elewise/elewise_host.h index cffb7f99ea4a7dd4f4fd9b04bea721ed6359c67b..3c64aaa90cc3fb2b25fdacf30254c2b4c69956ce 100644 --- a/atvc/include/elewise/elewise_host.h +++ b/atvc/include/elewise/elewise_host.h @@ -22,6 +22,7 @@ #include "common/compile_info.h" #include "common/atvc_opdef.h" #include "elewise/common/elewise_common.h" + namespace ATVC { namespace Host { namespace { diff --git a/atvc/include/elewise/elewise_op_template.h b/atvc/include/elewise/elewise_op_template.h index ff1d05604181a42f1c725c740e84fd9f0eafeec9..2646597061718f5177ba87005b0dcfa40daf9dc4 100644 --- a/atvc/include/elewise/elewise_op_template.h +++ b/atvc/include/elewise/elewise_op_template.h @@ -23,7 +23,7 @@ namespace ATVC { namespace Kernel { -template +template class EleWiseOpTemplate { using EleWiseOpTraits = typename GetFunctionTraits::ComputeTraits; using Inputs = typename EleWiseOpTraits::In::types; @@ -64,7 +64,7 @@ private: FillOffsets(outOffsets_); FillOffsets(tempOffsets_); - this->param_ = reinterpret_cast<__gm__ ATVC::EleWiseParam*>(t0); + this->param_ = reinterpret_cast(t0); uint32_t curBlockId = AscendC::GetBlockIdx(); if (curBlockId < param_->tilingData.tailBlockCnt) { @@ -328,7 +328,7 @@ private: std::size_t tempOffsets_[TempCount]; // 计算得到的tiling数据 - __gm__ EleWiseParam* param_; + ParamPtr param_; uint32_t curCoreCnt_; uint32_t curCoreStartCnt_; diff --git a/atvc/include/reduce/common/reduce_common.h b/atvc/include/reduce/common/reduce_common.h index fca80f3fe2d28dc5a2dbdf9fc8877f3610c19758..309cc49ea42b4b1220a4d9cf05fa0c27a66aa7f8 100644 --- a/atvc/include/reduce/common/reduce_common.h +++ b/atvc/include/reduce/common/reduce_common.h @@ -76,16 +76,27 @@ namespace AR_COUNT { static constexpr uint32_t A5R9 = 59; }; +constexpr int32_t poly1 = 1000000; +constexpr int32_t poly2 = 1000; + struct ReducePolicy { public: int32_t patternID = -1; int32_t loopARCount = -1; int32_t loopInnerARCount = -1; + int32_t ID = -1; + constexpr ReducePolicy(int32_t patternID_, int32_t loopARCount_, int32_t loopInnerARCount_) + : patternID(patternID_), loopARCount(loopARCount_), loopInnerARCount(loopInnerARCount_), + ID(poly1 * patternID_ + poly2 * loopARCount_ + loopInnerARCount_) + {} bool operator==(const ReducePolicy& rhs) const { return this->patternID == rhs.patternID && this->loopARCount == rhs.loopARCount &&\ this->loopInnerARCount == rhs.loopInnerARCount; } + int32_t getID() { + return poly1 * this->patternID + poly2 * this->loopARCount + this->loopInnerARCount; + } }; static constexpr ReducePolicy REDUCE_POLICY0 { AR_PATTERN::A, AR_COUNT::A1R0, 0 }; @@ -134,6 +145,7 @@ struct ReduceParam { uint32_t workspaceSize = 0; // 申请空间大小 ReduceTilingData tilingData; // Reduce类算子的tiling数据 int32_t nBufferNum = 2; // 每个Queue中的Tensor数量 + int32_t policyId = -1; }; struct ReduceSchLoopInfo { diff --git a/atvc/include/reduce/reduce_device.h b/atvc/include/reduce/reduce_device.h new file mode 100644 index 0000000000000000000000000000000000000000..3b862d04fdc9ac31362cf00332324e0f2a1721f5 --- /dev/null +++ b/atvc/include/reduce/reduce_device.h @@ -0,0 +1,31 @@ +/** + * Copyright (c) Huawei 
Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + + +/*! + * \file reduce_device.h + * \brief + */ + +#ifndef ATVC_REDUCE_DEVICE_H +#define ATVC_REDUCE_DEVICE_H + +#include "common/atvc_opdef.h" +#include "common/const_def.h" +#include "reduce/common/reduce_common.h" +#include "kernel_operator.h" +#ifndef __ASCC_HOST__ +#include "common/kernel_utils.h" +#include "reduce/reduce_sum.h" +#include "reduce/reduce_op_template.h" +#endif + +#endif // ATVC_REDUCE_DEVICE_H \ No newline at end of file diff --git a/atvc/include/reduce/reduce_host.h b/atvc/include/reduce/reduce_host.h index 479a98d9aaea6e8f461b323acbf8cc4edd59754e..7834a895aa671da6261ce9b84b718350356632a2 100644 --- a/atvc/include/reduce/reduce_host.h +++ b/atvc/include/reduce/reduce_host.h @@ -12,7 +12,10 @@ #define ATVC_REDUCE_HOST_H #include "common/atvc_opdef.h" #include "common/dtype_utils.h" +#include "common/const_def.h" +#include "reduce/common/reduce_common.h" #include "reduce/tiling/reduce_tiling.h" +#include "reduce/tiling/tiling_common.h" namespace ATVC { namespace Host { @@ -77,7 +80,7 @@ bool CalcReduceTiling(std::vector inputShape, printf("[ERROR] Reduce template does not support this data type!\n"); return false; } - ReduceTilingInputParam opInput = {reduceDim, inputShape, inputDtype, GetPromoteDataType(inputDtype)}; + OpTiling::ReduceTilingInputParam opInput = {reduceDim, inputShape, inputDtype, GetPromoteDataType(inputDtype)}; OpTiling::ReduceOpTiling tiling(opInput, policy, param); if (tiling.Run() != 0) { printf("[ERROR] Tiling Error\n"); diff --git a/atvc/include/reduce/reduce_op_template.h b/atvc/include/reduce/reduce_op_template.h index 17c747e6df7d218e8e8f659f62e76746aae3f870..7bdbb0600904bdaca6e31b4e3178b286251a8ff9 100644 --- a/atvc/include/reduce/reduce_op_template.h +++ b/atvc/include/reduce/reduce_op_template.h @@ -25,13 +25,14 @@ #include "reduce/reduce_utils/reduce_block_aux.h" #include "reduce/reduce_utils/reduce_util.h" #include "reduce/reduce_utils/reduce_buf_pool.h" +#include "reduce/common/reduce_common.h" namespace ATVC { namespace Kernel { template -class ReduceOpTemplate -{ + class PreCompute = void, class PostCompute = void, + typename ParamPtr = __gm__ ATVC::ReduceParam*> +class ReduceOpTemplate { public: constexpr static ReduceSchLoopInfo SchLoopInfo = KernelUtils::Reduce::GetSchLoopInfo(); using Pattern = typename ReducePattern::GetPattern::T; @@ -55,39 +56,15 @@ public: // 按照输入、输出、ReduceParam、其他标量的顺序传入 // 内部根据ReduceParam进行数据调度并调用ReduceOpTemplate完成计算后搬出到GM - template - __aicore__ inline void Run(GM_ADDR x, GM_ADDR y, GM_ADDR param) { - param_ = reinterpret_cast<__gm__ ReduceParam*>(param); + template + __aicore__ inline void Run(GM_ADDR x, GM_ADDR y, T1 param) { + param_ = reinterpret_cast(param); // 完成一些编译期的检查,比如PreCompute和PostCompute的In、Out个数是否与args的个数匹配 - tiling_ = ¶m_->tilingData; - Init((GM_ADDR)(param_->workspaceAddr), x, y); - Process(); - } -protected: - template - __aicore__ inline void InitArgsWorkspace(GM_ADDR workspace, Args... 
args) - { - workspace_.SetGlobalBuffer((__gm__ PromoteDataType*)workspace); - } - - template - __aicore__ inline void InitArgsOutput(GM_ADDR y, Args... args) - { - output_[start].SetGlobalBuffer((__gm__ DataType*)y); - if constexpr (start + 1 < OutputSize) { - InitArgsOutput(args...); - } - } - - template - __aicore__ inline void InitArgsInput(GM_ADDR x, Args... args) - { - input_[start].SetGlobalBuffer((__gm__ DataType*)x); - if constexpr (start + 1 < InputSize) { - InitArgsInput(args...); - } else { - InitArgsOutput<0>(args...); + if constexpr (AscendC::IsSameType::value) { + Process<__gm__ ATVC::ReduceTilingData>(); + } else if constexpr (AscendC::IsSameType::value) { + Process(); } } @@ -96,8 +73,8 @@ public: __aicore__ inline void Init(GM_ADDR workspace, Args... args) { pipe_ = GetTPipePtr(); - basicBlockLen_ = tiling_->basicBlock; - bufPool_.template Init(pipe_, TBufSize, PromoteBufSize, tiling_->basicBlock); + basicBlockLen_ = this->param_->tilingData.basicBlock; + bufPool_.template Init(pipe_, TBufSize, PromoteBufSize, this->param_->tilingData.basicBlock); InitArgsInput<0>(args...); InitArgsWorkspace(workspace); @@ -123,7 +100,7 @@ public: bufPool_.FreeTensor(tensor); } - template + template __aicore__ inline void Process(Args... args) { if constexpr (SelectReducePolicy.patternID == ATVC::AR_PATTERN::A) { @@ -131,16 +108,20 @@ public: return; } if constexpr (SchLoopInfo.loopRCount == 0) { - using SchTypeA = KernelUtils::Reduce::ReduceBlockAux<__gm__ ATVC::ReduceTilingData, &SchLoopInfo, std::remove_reference_t, DataType, - DataType, PreCompute, PostCompute>; + using SchTypeA = KernelUtils::Reduce::ReduceBlockAux< + T1, &SchLoopInfo, + std::remove_reference_t, DataType, DataType, + PreCompute, PostCompute>; - SchTypeA op(this, input_, output_, tiling_); + SchTypeA op(this, input_, output_, &(this->param_)->tilingData); op.Process(args...); } else { // 完成第一阶段的Reduce - using SchTypeR = KernelUtils::Reduce::ReduceBlockAux<__gm__ ATVC::ReduceTilingData, &SchLoopInfo, std::remove_reference_t, DataType, - PromoteDataType, PreCompute, void>; - SchTypeR op(this, input_, &workspace_, tiling_); + using SchTypeR = KernelUtils::Reduce::ReduceBlockAux< + T1, &SchLoopInfo, + std::remove_reference_t, DataType, + PromoteDataType, PreCompute, void>; + SchTypeR op(this, input_, &workspace_, &(this->param_)->tilingData); op.Process(args...); bufPool_.ResetEvent(); @@ -152,8 +133,10 @@ public: constexpr static ReduceSchLoopInfo groupSchLoopInfo = KernelUtils::Reduce::GetGroupSchLoopInfo(); ATVC::ReduceTilingData groupTiling; SetGroupTiling(groupTiling); - using SchTypeA = KernelUtils::Reduce::ReduceBlockAux, - PromoteDataType, DataType, void, PostCompute>; + using SchTypeA = KernelUtils::Reduce::ReduceBlockAux< + ATVC::ReduceTilingData, &groupSchLoopInfo, + std::remove_reference_t, PromoteDataType, + DataType, void, PostCompute>; SchTypeA groupOp(this, &workspace_, output_, &groupTiling); groupOp.Process(args...); } @@ -162,24 +145,26 @@ public: __aicore__ inline void SetGroupTiling(ATVC::ReduceTilingData& groupTiling) { groupTiling.ubFactorA = ELEMENT_ONE_REPEAT_COMPUTE; - groupTiling.ubFactorR = tiling_->groupR; - groupTiling.shape[0] = tiling_->groupR; - groupTiling.shape[1] = tiling_->outSize; - groupTiling.stride[0] = tiling_->outSize; + groupTiling.ubFactorR = this->param_->tilingData.groupR; + groupTiling.shape[0] = this->param_->tilingData.groupR; + groupTiling.shape[1] = this->param_->tilingData.outSize; + groupTiling.stride[0] = this->param_->tilingData.outSize; groupTiling.stride[1] = 1; - 
groupTiling.dstStride[0] = tiling_->outSize; + groupTiling.dstStride[0] = this->param_->tilingData.outSize; groupTiling.dstStride[1] = 1; groupTiling.groupR = 1; - groupTiling.outSize = tiling_->outSize; + groupTiling.outSize = this->param_->tilingData.outSize; groupTiling.factorRCntPerCore = 1; groupTiling.factorRTotalCnt = 1; groupTiling.factorATotalCnt = OpsUtils::CeilDiv(groupTiling.shape[1], groupTiling.ubFactorA); groupTiling.factorACntPerCore = OpsUtils::CeilDiv(groupTiling.factorATotalCnt, - static_cast(64)); // 按照64核计算,需要tiling传 + static_cast(64)); // 按照64核计算,需要tiling传 } template - __aicore__ inline void CopyInAux(const AscendC::GlobalTensor& src, U& view, V& shape, const AscendC::LocalTensor& ubTensor) + __aicore__ inline void CopyInAux(const AscendC::GlobalTensor &src, + U &view, V &shape, + const AscendC::LocalTensor &ubTensor) { T paddingValue = compute_.template GetPaddingValue(); uint8_t padCnt = ((view.axis[0].dstStride - view.burstLen) * sizeof(T)) % BLOCK_SIZE_BYTE / sizeof(T); @@ -197,34 +182,45 @@ public: (view.axis[0].dstStride - view.burstLen) * sizeof(T) / BLOCK_SIZE_BYTE; // unit block(32byte) bufPool_.SyncTensor(ubTensor); - for (int32_t i = 0; i < view.axis[CONST6].repeat; i++) { - for (int32_t j = 0; j < view.axis[CONST5].repeat; j++) { - for (int32_t k = 0; k < view.axis[CONST4].repeat; k++) { - for (int32_t l = 0; l < view.axis[CONST3].repeat; l++) { - for (int32_t m = 0; m < view.axis[CONST2].repeat; m++) { - for (int32_t n = 0; n < view.axis[CONST1].repeat; n++) { - int64_t dstStride = i * view.axis[CONST6].dstStride + j * view.axis[CONST5].dstStride + - k * view.axis[CONST4].dstStride + l * view.axis[CONST3].dstStride + - m * view.axis[CONST2].dstStride + n * view.axis[CONST1].dstStride; - int64_t srcStride = i * view.axis[CONST6].srcStride + j * view.axis[CONST5].srcStride + - k * view.axis[CONST4].srcStride + l * view.axis[CONST3].srcStride + - m * view.axis[CONST2].srcStride + n * view.axis[CONST1].srcStride; - AscendC::DataCopyPad(ubTensor[dstStride], src[view.addr + srcStride], copyInParams, padParams); - } - } - } - } + const int32_t repeats[CONST6] = {static_cast(view.axis[CONST1].repeat), + static_cast(view.axis[CONST2].repeat), static_cast(view.axis[CONST3].repeat), + static_cast(view.axis[CONST4].repeat), static_cast(view.axis[CONST5].repeat), + static_cast(view.axis[CONST6].repeat)}; + const int32_t dstStrides[CONST6] = {static_cast(view.axis[CONST1].dstStride), + static_cast(view.axis[CONST2].dstStride), static_cast(view.axis[CONST3].dstStride), + static_cast(view.axis[CONST4].dstStride), static_cast(view.axis[CONST5].dstStride), + static_cast(view.axis[CONST6].dstStride)}; + const int32_t srcStrides[CONST6] = {static_cast(view.axis[CONST1].srcStride), + static_cast(view.axis[CONST2].srcStride), static_cast(view.axis[CONST3].srcStride), + static_cast(view.axis[CONST4].srcStride), static_cast(view.axis[CONST5].srcStride), + static_cast(view.axis[CONST6].srcStride)}; + + int32_t total = 1; + for (int32_t i = 0; i < CONST6; ++i) + total *= repeats[i]; + + for (int32_t idx = 0; idx < total; ++idx) { + int32_t tmp = idx; + int32_t dstOffset = 0; + int32_t srcOffset = 0; + for (int32_t axis = 0; axis < CONST6; ++axis) { + int32_t coord = tmp % repeats[axis]; + tmp /= repeats[axis]; + dstOffset += coord * dstStrides[axis]; + srcOffset += coord * srcStrides[axis]; } + AscendC::DataCopyPad(ubTensor[dstOffset], src[view.addr + srcOffset], copyInParams, padParams); } } - __aicore__ inline void CopyInput2Output() { + __aicore__ inline void CopyInput2Output() 
+ { uint32_t shapeSize = 1; for (uint8_t i = 0; i < MAX_DIM; i++) { - if (tiling_->shape[i] <= 1) { + if (this->param_->tilingData.shape[i] <= 1) { break; } - shapeSize = shapeSize * tiling_->shape[i]; + shapeSize = shapeSize * this->param_->tilingData.shape[i]; } shapeSize = (shapeSize * sizeof(DataType) + UB_ALIGN_31) / UB_ALIGN_32 * UB_ALIGN_32 / sizeof(DataType); AscendC::LocalTensor tmpLoc; @@ -287,8 +283,36 @@ public: AscendC::DataCopyPad(dst[view.addr], tempBuf_[tmpBufOffest], copyOutParams); AscendC::PipeBarrier(); } + +protected: + template + __aicore__ inline void InitArgsWorkspace(GM_ADDR workspace, Args... args) + { + workspace_.SetGlobalBuffer((__gm__ PromoteDataType*)workspace); + } + + template + __aicore__ inline void InitArgsOutput(GM_ADDR y, Args... args) + { + output_[start].SetGlobalBuffer((__gm__ DataType*)y); + if constexpr (start + 1 < OutputSize) { + InitArgsOutput(args...); + } + } + + template + __aicore__ inline void InitArgsInput(GM_ADDR x, Args... args) + { + input_[start].SetGlobalBuffer((__gm__ DataType*)x); + if constexpr (start + 1 < InputSize) { + InitArgsInput(args...); + } else { + InitArgsOutput<0>(args...); + } + } + private: - __gm__ ReduceParam* param_; // CalcReduceTiling API计算出的运行态参数 + ParamPtr param_; // CalcReduceTiling API计算出的运行态参数 AscendC::TPipe* pipe_; AscendC::TBuf<> oriVecQue_; AscendC::TBuf<> tempResQue_; @@ -302,10 +326,9 @@ private: AscendC::GlobalTensor output_[OutputSize]; AscendC::GlobalTensor input_[InputSize]; AscendC::GlobalTensor workspace_; - - AscendC::LocalTensor tempUb_; - const __gm__ ATVC::ReduceTilingData* tiling_; - + + AscendC::LocalTensor tempUb_; + int64_t basicBlockLen_; int64_t oriBasicBlockLen_; diff --git a/atvc/include/reduce/reduce_sum.h b/atvc/include/reduce/reduce_sum.h index e5d36a877918a470febf610229712e74ffb24542..d806251313cf876602993d4335c76b212cab62e9 100644 --- a/atvc/include/reduce/reduce_sum.h +++ b/atvc/include/reduce/reduce_sum.h @@ -16,6 +16,20 @@ #include "reduce/common/patterns.h" #include "reduce/reduce_utils/reduce_block_aux_util.h" +namespace { +struct ReduceARParam { + uint32_t repStride = 0; + uint16_t dimA = 0; + uint16_t dimMax = 0; + uint16_t mainR = 0; + uint16_t tailR = 0; + uint64_t maskAddRNum = 0; + uint16_t loopRNum = 0; + uint16_t dtypeSize = 0; + uint16_t dimR = 0; +}; +} + namespace ATVC { // OpTraits: 算子描述的ATVC::OpTraits结构体 @@ -30,17 +44,22 @@ public: __aicore__ inline ReduceSumCompute() {} template - __aicore__ inline void Compute(KernelUtils::Shape<2>& shape, const AscendC::LocalTensor& dst, const AscendC::LocalTensor& src) + __aicore__ inline void + Compute(KernelUtils::Shape<2> &shape, + const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src) { // AR场景,硬件限制,R轴需要做UB上32B对齐,对齐方式有2种: // 1. 高性能对齐(补充元素值不确定), 后续累加计算只能计算实际有效的元素个数 // 2. 补0对齐(补值是由用户实现的GetPaddingValue()接口决定的) if (std::is_same::value) { if constexpr (needMask) { // 1. 
高性能对齐模式 - int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); // MainR(int64_t dimR, bool isAR): 框架提供的计算R轴二分长度(元素个数), dimR为原始的元素个数 + // MainR(int64_t dimR, bool isAR): 框架提供的计算R轴二分长度(元素个数), dimR为原始的元素个数 + int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen); } else { - int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); // MainR:框架提供的计算R轴二分长度(元素个数),dimR为补齐后的元素个数 + // MainR:框架提供的计算R轴二分长度(元素个数),dimR为补齐后的元素个数 + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]); } } @@ -50,20 +69,24 @@ public: } } - __aicore__ inline void ReduceRA(const AscendC::LocalTensor& dst, const AscendC::LocalTensor& src, uint16_t dimA, uint16_t dimR, - uint16_t mainR) + __aicore__ inline void + ReduceRA(const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src, uint16_t dimA, + uint16_t dimR, uint16_t mainR) { uint32_t totalNum = dimR * dimA; uint32_t mainNum = dimA * mainR; uint32_t dtypeSize = sizeof(PrompteDtype); uint32_t tailNum = totalNum - mainNum; - uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; // add mask最大值为256 bytes 且要满足32bytes对齐 + // add mask最大值为256 bytes 且要满足32bytes对齐 + uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; // 处理tail uint16_t repeatTimes = tailNum / maskAddNum; uint16_t repeatNum = repeatTimes * maskAddNum; uint16_t repTailNum = tailNum - repeatNum; uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32; // 不同迭代间同一datablock步长 - AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); if (repeatTimes > 0) { AscendC::Add(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); } @@ -77,9 +100,9 @@ public: AscendC::PipeBarrier(); // 二分主体 uint16_t loopRNum = mainR; - while(loopRNum > 1) { + while (loopRNum > 1) { loopRNum = loopRNum >> 1; - mainNum = loopRNum * dimA;// LoopR的前半部分数据量 + mainNum = loopRNum * dimA; // LoopR的前半部分数据量 repeatTimes = mainNum / maskAddNum; repeatNum = repeatTimes * maskAddNum; repTailNum = mainNum - repeatNum; @@ -102,8 +125,10 @@ public: AscendC::DataCopy(dst, src, dimA); } - __aicore__ inline void ReduceAR(const AscendC::LocalTensor& dstTensor, const AscendC::LocalTensor& srcTensor, uint16_t dimA, uint16_t dimR, - uint16_t mainR, uint64_t oriBurstLen) + __aicore__ inline void + ReduceAR(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, uint16_t dimA, + uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen) { uint16_t tailR = oriBurstLen - mainR; uint16_t dtypeSize = sizeof(PrompteDtype); @@ -111,74 +136,31 @@ public: uint16_t dimMax = dimA * dimR; uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize; + ReduceARParam param{ + .repStride = repStride, + .dimA = dimA, + .dimMax = dimMax, + .mainR = mainR, + .tailR = tailR, + .maskAddRNum = maskAddRNum, + .dtypeSize = dtypeSize, + .dimR = dimR + }; + if (mainR > 0 && tailR > 0) { - uint16_t addRTotalNum = tailR / maskAddRNum * maskAddRNum; - uint16_t addRTail = tailR - addRTotalNum; - AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, 
src0RepStride, src1RepStride - if (repStride > UB_ALIGN_255) { - for (uint16_t i = 0; i < dimMax; i += dimR) { - AscendC::Add(srcTensor[i], srcTensor[i], srcTensor[i + mainR], tailR); - } - } else { - for (uint16_t i = 0; i < addRTotalNum; i += maskAddRNum) { - AscendC::Add(srcTensor[i], srcTensor[i + mainR], srcTensor[i], maskAddRNum, dimA, repeatParams); - } - if (addRTail > 0) { - AscendC::Add(srcTensor[addRTotalNum], srcTensor[addRTotalNum + mainR], srcTensor[addRTotalNum], addRTail, dimA, repeatParams); - } - } - AscendC::PipeBarrier(); + PerformInitialAdd(srcTensor, param); } // 二分计算 - uint16_t loopRNum = mainR; - while(loopRNum > maskAddRNum) { - loopRNum = loopRNum / 2; // 除2二分 - if (repStride > UB_ALIGN_255) { - for (uint16_t i = 0; i < dimMax; i += dimR) { - AscendC::Add(srcTensor[i], srcTensor[i], srcTensor[i + loopRNum], loopRNum); - } - } else { - uint16_t addRTotalNum = loopRNum / maskAddRNum * maskAddRNum; - uint16_t addRTail = loopRNum -addRTotalNum; - AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride - for (uint16_t i = 0; i < addRTotalNum; i += maskAddRNum) { - AscendC::Add(srcTensor[i], srcTensor[i + loopRNum], srcTensor[i], maskAddRNum, dimA, repeatParams); - } - if (addRTail > 0) { - AscendC::Add(srcTensor[addRTotalNum], srcTensor[addRTotalNum], srcTensor[addRTotalNum + loopRNum], addRTail, dimA, repeatParams); - } - } - AscendC::PipeBarrier(); - } - if (loopRNum == 0) { // small shape, 直接reduce - loopRNum = tailR; + param.loopRNum = mainR; + while (param.loopRNum > maskAddRNum) { + param.loopRNum = param.loopRNum / 2; // 除2二分 + PerformBinaryReduction(srcTensor, param); } - if constexpr ( AscendC::IsSameType::value || AscendC::IsSameType::value) { - uint16_t reduceLoopTimes = UB_ALIGN_255 * dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / dtypeSize; - for (uint16_t dimAIdx = 0; dimAIdx < dimA; dimAIdx += reduceLoopTimes) { // WholeReduceSum repeattime最大值为255 255附近为了dimA需要分多次 - uint16_t curDimA = (dimAIdx + reduceLoopTimes < dimA) ? reduceLoopTimes : dimA - dimAIdx; - AscendC::WholeReduceSum(dstTensor[dimAIdx], srcTensor[dimAIdx * dimR], loopRNum, curDimA, 1, 1, repStride); - } - AscendC::PipeBarrier(); - } else if constexpr ( AscendC::IsSameType::value || AscendC::IsSameType::value) { - maskAddRNum = UB_ALIGN_32 / dtypeSize; - // 尽量二分add到最后32bytes - // int32 -> float 都是4字,一把cast 用CAST_NONE - AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); - AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); - AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, dimA *dimR); - AscendC::PipeBarrier(); - uint16_t reduceLoopTimes = 255 * dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / dtypeSize; - for (uint16_t dimAIdx = 0; dimAIdx < dimA; dimAIdx += reduceLoopTimes) { // WholeReduceSum repeattime最大值为255 255附近为了dimA需要分多次 - uint16_t curDimA = (dimAIdx + reduceLoopTimes < dimA) ? 
reduceLoopTimes : dimA - dimAIdx; - AscendC::WholeReduceSum(interpreDst[dimAIdx], interpreSrc[dimAIdx * dimR], loopRNum, curDimA, 1, 1, repStride); - } - AscendC::PipeBarrier(); - AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); - } else { - return; + if (param.loopRNum == 0) { // small shape, 直接reduce + param.loopRNum = tailR; } + PerformFinalReduction(dstTensor, srcTensor, param); } template @@ -202,15 +184,19 @@ public: for (uint16_t i = 0; i < outerLoopTimes; ++i) { // outerLoopTimes是dimA的大小 uint32_t srcIdx = i * outerLoopStride; for (uint16_t j = 0; j < innerLoopTimes; ++j) { - AscendC::Add(srcTensor[srcIdx], srcTensor[srcIdx], dstTensor[srcIdx + j * innerLoopStride], outerLoopStride); + AscendC::Add(srcTensor[srcIdx], srcTensor[srcIdx], + dstTensor[srcIdx + j * innerLoopStride], + outerLoopStride); AscendC::PipeBarrier(); } DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride); } } - __aicore__ inline void ReduceBetweenUB(const AscendC::LocalTensor& ubTensorLeft, - const AscendC::LocalTensor& ubTensorRight, const int32_t& calCount) + __aicore__ inline void + ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, + const AscendC::LocalTensor &ubTensorRight, + const int32_t &calCount) { Add(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); } @@ -221,6 +207,95 @@ public: U paddingValue = 0; // 由于ReduceSum是累加R轴数据,补齐的元素值设为0,才能保证累加的结果不受影响 return paddingValue; } + +private: + __aicore__ inline void PerformInitialAdd(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.tailR - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { + AscendC::Add(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); + } + } else { + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Add(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Add(srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.mainR], + srcTensor[addRTotalNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor, + const ReduceARParam& param) + { + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) { + AscendC::Add(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum); + } + } else { + uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.loopRNum - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Add(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Add(srcTensor[addRTotalNum], + srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.loopRNum], + addRTail, + param.dimA, + repeatParams); + } + } + 
AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceSum repeattime最大值为255 255附近为了dimA需要分多次 + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceSum( + dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride); + } + AscendC::PipeBarrier(); + } else if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + // 尽量二分add到最后32bytes + // int32 -> float 都是4字,一把cast 用CAST_NONE + AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); + AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); + AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); + AscendC::PipeBarrier(); + + uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceSum repeattime最大值为255 255附近为了dimA需要分多次 + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceSum( + interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride); + } + AscendC::PipeBarrier(); + AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); + } + } }; } // namespace ATVC diff --git a/atvc/include/reduce/reduce_utils/reduce_block_aux_util.h b/atvc/include/reduce/reduce_utils/reduce_block_aux_util.h index 612b1d0e3a5eccb8a935f6232715eec6f843154f..adcd122d0e9ad47826790e0ba01d289402c5dd26 100644 --- a/atvc/include/reduce/reduce_utils/reduce_block_aux_util.h +++ b/atvc/include/reduce/reduce_utils/reduce_block_aux_util.h @@ -19,8 +19,9 @@ #include "reduce_util.h" #include "kernel_operator.h" #include "common/platform.h" -#include "common/ops_utils.h" +#include "common/ops_utils_device.h" #include "reduce/common/patterns.h" +#include "reduce/common/reduce_common.h" namespace ATVC { diff --git a/atvc/include/reduce/reduce_utils/reduce_buf_pool.h b/atvc/include/reduce/reduce_utils/reduce_buf_pool.h index 01c63adaaa0391efa278c47d1dfb3d10790cdb5f..f27ec468f374b11d7d1f389fa156a327cb71a5fa 100644 --- a/atvc/include/reduce/reduce_utils/reduce_buf_pool.h +++ b/atvc/include/reduce/reduce_utils/reduce_buf_pool.h @@ -19,7 +19,7 @@ #include "kernel_operator.h" #include "common/platform.h" -#include "common/ops_utils.h" +#include "common/ops_utils_device.h" namespace ATVC { namespace KernelUtils { diff --git a/atvc/include/reduce/reduce_utils/reduce_util.h b/atvc/include/reduce/reduce_utils/reduce_util.h index 082cad8732ed5a36bf7e41febf869c641df85e82..556f2867ae4367a9a20c164643fb8c3f8de047a5 100644 --- a/atvc/include/reduce/reduce_utils/reduce_util.h +++ b/atvc/include/reduce/reduce_utils/reduce_util.h @@ -27,20 +27,20 @@ struct Shape { int64_t oriBurstLen; }; -template +template __aicore__ inline constexpr int32_t GetCopyInCount() { - if constexpr (AscendC::IsSameType::value || AscendC::IsSameType::value) { + if constexpr (AscendC::IsSameType::value || AscendC::IsSameType::value) { 
return CONST2; } else { return CONST3; } } -template +template __aicore__ inline constexpr int32_t GetComputeCount() { - if constexpr (AscendC::IsSameType::value || AscendC::IsSameType::value) { + if constexpr (AscendC::IsSameType::value || AscendC::IsSameType::value) { return CONST2; } else { return CONST0; diff --git a/atvc/include/reduce/tiling/reduce_tiling.h b/atvc/include/reduce/tiling/reduce_tiling.h index f649485c8e7db518c5383d53222893b16a653bcf..92d90df69dbb3db49d1a6a78af342e9818ca06fa 100644 --- a/atvc/include/reduce/tiling/reduce_tiling.h +++ b/atvc/include/reduce/tiling/reduce_tiling.h @@ -17,58 +17,10 @@ #include "graph/types.h" #include "common/compile_info.h" #include "common/const_def.h" -#include "reduce/common/patterns.h" -#include "common/ops_utils.h" - -namespace { -constexpr static int32_t BASIC_BLOCK = 48 * 1024; -constexpr static int32_t CACHE_SIZE = 16 * 1024; // cahce size for ub reduce -constexpr static int32_t MAX_INNER_A = 128; -constexpr static double THRES_HOLD = 0.85; -constexpr static int32_t A_STEP_LEN = 4; - -struct ReduceTilingUnit { - int32_t idx = -1; // ub cut axis - uint64_t inner = 1; // inner size in ub - uint64_t outer = 1; // outer size of ub - uint64_t step = 1; // step of cacheline - void Update(int32_t idx, uint64_t inner, uint64_t outer, uint64_t step) - { - this->idx = idx; - this->inner = inner; - this->outer = outer; - this->step = step; - } -}; - -struct CacheLineBlock { - int32_t axis = -1; // cacheline cut axis - uint64_t size = 1; // cacheline size - uint64_t cacheLineStep = 1; // cacheline cut size for axis - uint64_t cacheLineOuter = 1; // relative to cacheLineStep, out size of cacheline cut axis - uint64_t aSize = 1; // A axis size in cacheline -}; - -struct ReduceTilingInputParam { - std::vector reduceDim = {}; - std::vector reduceShape = {}; - ge::DataType inputDtype = ge::DataType::DT_UNDEFINED; - ge::DataType promoteDtpye = ge::DataType::DT_UNDEFINED; -}; -} +#include "common/ops_utils_host.h" +#include "tiling_common.h" namespace OpTiling { -static void MakeWrapDim(const std::vector& shape, std::vector& axes) -{ - // EnsureNotScalar at least return 1-D Tensor, so shapeSize cannot be 0 - size_t shapeSize = shape.size(); - for (size_t i = 0; i < axes.size(); i++) { - if (axes[i] < 0) { - axes[i] += shapeSize; - } - } -} - class ReduceOpTiling { public: ReduceOpTiling(ReduceTilingInputParam& inputParam, @@ -157,24 +109,6 @@ void CalcWorkSpace() param_->workspaceSize = ATVC::WORKSPACE_SIZE + spaceSize; } -int32_t IsAxesValid(const std::vector& shape, const std::vector& axes) -{ - size_t shapeSize = shape.size(); - size_t axesSize = axes.size(); - if (axesSize > shapeSize) { - printf("[ERROR] axis size is greater than shape size\n"); - return -1; - }; - - for (size_t i = 0; i < axesSize; i++) { - if (axes[i] >= static_cast(shapeSize) || axes[i] < 0) { - printf("[ERROR] axis size incorrect \n"); - return -1; - }; - } - return 0; -} - void EliminateOne(const std::vector& oriShape, std::vector& axes, std::vector& shape, int32_t& shapeSize) { @@ -324,16 +258,6 @@ void CalcBasicBlock() } } -template -bool IsEmtpyTensor(const std::vector& shape) -{ - for (int32_t i = 0; i < Pattern::Dim; i++) { - if (shape[i] == 0) { - return true; - } - } - return false; -} template bool ComputeEmptyTiling(std::vector& shape) { @@ -442,15 +366,6 @@ void ComputeSplit(std::vector& shape) OpsUtils::CeilDiv(unitR_.outer, factorRCntPerCore); param_->tilingData.coreNum = realCore; } -template -bool IsAxisA(int32_t idx) -{ - if (Pattern::FirstA) { - return idx % 
ATVC::CONST2 == 0;
-    } else {
-        return idx % ATVC::CONST2 == 1;
-    }
-}
 
 template
 bool CalcCacheLineStep(const std::vector& shape)
@@ -593,7 +508,7 @@ private:
     int32_t basicBlock_ = 0;
     ATVC::ReduceParam* param_ {nullptr};
     ATVC::ReducePolicy* policy_ {nullptr};
-    ATVC::OpCompileInfo compileInfo_;
+    ATVC::OpCompileInfo compileInfo_ = {0, 0, 0, 0};
     CacheLineBlock cBlock_;
     ReduceTilingUnit unitA_;
     ReduceTilingUnit unitR_;
diff --git a/atvc/include/reduce/tiling/tiling_common.h b/atvc/include/reduce/tiling/tiling_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f96757a6026c739c14ebc007d9bf9112ebf433f
--- /dev/null
+++ b/atvc/include/reduce/tiling/tiling_common.h
@@ -0,0 +1,107 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#ifndef ATVC_TILING_COMMON_H
+#define ATVC_TILING_COMMON_H
+#include "reduce/common/patterns.h"
+
+namespace OpTiling {
+constexpr static int32_t BASIC_BLOCK = 48 * 1024;
+constexpr static int32_t CACHE_SIZE = 16 * 1024; // cache size for ub reduce
+constexpr static int32_t MAX_INNER_A = 128;
+constexpr static double THRES_HOLD = 0.85;
+constexpr static int32_t A_STEP_LEN = 4;
+
+struct ReduceTilingUnit {
+    int32_t idx = -1; // ub cut axis
+    uint64_t inner = 1; // inner size in ub
+    uint64_t outer = 1; // outer size of ub
+    uint64_t step = 1; // step of cacheline
+    void Update(int32_t idx, uint64_t inner, uint64_t outer, uint64_t step)
+    {
+        this->idx = idx;
+        this->inner = inner;
+        this->outer = outer;
+        this->step = step;
+    }
+};
+
+struct CacheLineBlock {
+    int32_t axis = -1; // cacheline cut axis
+    uint64_t size = 1; // cacheline size
+    uint64_t cacheLineStep = 1; // cacheline cut size for axis
+    uint64_t cacheLineOuter = 1; // relative to cacheLineStep, out size of cacheline cut axis
+    uint64_t aSize = 1; // A axis size in cacheline
+};
+
+struct ReduceTilingInputParam {
+    std::vector reduceDim = {};
+    std::vector reduceShape = {};
+    ge::DataType inputDtype = ge::DataType::DT_UNDEFINED;
+    ge::DataType promoteDtpye = ge::DataType::DT_UNDEFINED;
+    ReduceTilingInputParam(std::vector reduceDim_, std::vector reduceShape_,
+        ge::DataType inputDtype_, ge::DataType promoteDtpye_):
+        reduceDim(reduceDim_), reduceShape(reduceShape_), inputDtype(inputDtype_), promoteDtpye(promoteDtpye_){}
+};
+
+inline void MakeWrapDim(const std::vector& shape, std::vector& axes)
+{
+    // EnsureNotScalar at least return 1-D Tensor, so shapeSize cannot be 0
+    size_t shapeSize = shape.size();
+    for (size_t i = 0; i < axes.size(); i++) {
+        if (axes[i] < 0) {
+            axes[i] += shapeSize;
+        }
+    }
+}
+
+template <typename Pattern>
+bool IsAxisA(int32_t idx)
+{
+    if (Pattern::FirstA) {
+        return idx % ATVC::CONST2 == 0;
+    } else {
+        return idx % ATVC::CONST2 == 1;
+    }
+}
+
+inline int32_t IsAxesValid(const std::vector& shape, const std::vector& axes)
+{
+    size_t shapeSize = shape.size();
+    size_t axesSize = axes.size();
+    if (axesSize > shapeSize) {
+        printf("[ERROR] axis size is greater than shape size\n");
+        return -1;
+    };
+
+    for (size_t i = 0; i < axesSize; i++) {
+        if (axes[i] >= static_cast(shapeSize) || axes[i] < 0) {
+            printf("[ERROR] axis size incorrect \n");
+            return -1;
+        };
+    }
+    return 0;
+}
+
+template <typename Pattern>
+bool IsEmtpyTensor(const std::vector& shape)
+{
+    for (int32_t i = 0; i < Pattern::Dim; i++) {
+        if (shape[i] == 0) {
+            return true;
+        }
+    }
+    return false;
+}
+
+} // namespace OpTiling
+
+#endif // ATVC_TILING_COMMON_H
\ No newline at end of file
diff --git a/atvc/tests/run_test.sh b/atvc/tests/run_test.sh
index 5c291c0eb16777075aec3e95e577266aae0b9856..667e24b6d14d8b883d253c477e896f8ab198fea6 100644
--- a/atvc/tests/run_test.sh
+++ b/atvc/tests/run_test.sh
@@ -23,7 +23,7 @@ else
 fi
 
 ATVC_HOME_DIR=$CURRENT_DIR/../
-TEST_CASE_LIST="add add_with_scalar reduce_sum sinh_custom broadcast_to"
+TEST_CASE_LIST=$(ls $ATVC_HOME_DIR/examples|xargs)
 if [ $# -ne 1 ]; then
     echo "This script takes only one test case name as input. Execution example: 'bash run_test.sh [$TEST_CASE_LIST]'"
     exit 1
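The two packing constants added to `reduce_common.h` above make each `ReducePolicy` self-describing: the new constexpr constructor folds `patternID`, `loopARCount` and `loopInnerARCount` into a single integer (`ID = 1000000 * patternID + 1000 * loopARCount + loopInnerARCount`), and `getID()` recomputes the same value from the fields. Below is a minimal standalone sketch of that encoding, using a simplified mirror of the patched struct; the field values are hypothetical, and `operator==`/`getID()` are omitted:

```cpp
#include <cassert>
#include <cstdint>

// Simplified mirror of ATVC::ReducePolicy from reduce_common.h (illustration only).
constexpr int32_t poly1 = 1000000;
constexpr int32_t poly2 = 1000;

struct ReducePolicy {
    int32_t patternID = -1;
    int32_t loopARCount = -1;
    int32_t loopInnerARCount = -1;
    int32_t ID = -1;
    constexpr ReducePolicy(int32_t patternID_, int32_t loopARCount_, int32_t loopInnerARCount_)
        : patternID(patternID_), loopARCount(loopARCount_), loopInnerARCount(loopInnerARCount_),
          ID(poly1 * patternID_ + poly2 * loopARCount_ + loopInnerARCount_) {}
};

int main()
{
    constexpr ReducePolicy p{2, 10, 0};  // hypothetical field values
    static_assert(p.ID == 2010000, "packed ID mismatch");
    // As long as loopARCount and loopInnerARCount stay below 1000, the packed
    // ID decodes losslessly back into the three original fields.
    assert(p.ID / poly1 == p.patternID);
    assert(p.ID % poly1 / poly2 == p.loopARCount);
    assert(p.ID % poly2 == p.loopInnerARCount);
    return 0;
}
```

Note that declaring this constructor suppresses the implicit default constructor, so a `ReducePolicy` (and likewise a `BroadcastPolicy`, which gains an analogous constructor in this patch) can no longer be default-constructed; every declaration must supply all three fields, as the `REDUCE_POLICY0`-style constants do.

The host-side helpers split out of `ops_utils.h` into `ops_utils_host.h` also gained zero-divisor guards. The sketch below restates two of them locally to show the intended rounding behavior; it re-declares the helpers for illustration rather than including the ATVC header:

```cpp
#include <cassert>
#include <cstdint>

// Local restatement of OpsUtils::CeilDiv / CeilAlign (host variant, no __aicore__).
template <typename T>
inline T CeilDiv(T a, T b)
{
    if (b == 0) {  // guard added by the patch: a zero divisor returns the input unchanged
        return a;
    }
    return (a + b - 1) / b;
}

template <typename T>
inline T CeilAlign(T a, T b)
{
    if (b == 0) {
        return a;
    }
    return (a + b - 1) / b * b;
}

int main()
{
    assert(CeilDiv<int64_t>(10, 4) == 3);    // 10 / 4 rounded up
    assert(CeilAlign<int64_t>(10, 4) == 12); // next multiple of 4
    assert(CeilDiv<int64_t>(10, 0) == 10);   // zero divisor: input passed through
    return 0;
}
```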