diff --git a/atvc/README.md b/atvc/README.md
index ce4a1b871c216db6d824acd28afd10d273fb965c..620e28c13366d2afc4f0cb10eedbcaa8d9ce3b78 100644
--- a/atvc/README.md
+++ b/atvc/README.md
@@ -72,15 +72,17 @@ Accuracy verification passed.
 | Reduce template |
 | Broadcast template |
 # Examples
-| Example | Description |
-| ------------------------------------------------------------ | ------------------------------------------------------------ |
-| [add](./examples/add/add.cpp) | Add operator implemented with the ATVC Ele-wise template, with an invocation example |
-| [sinh_custom](./examples/sinh_custom/sinh_custom.cpp) | Custom Ele-wise operator that uses a temporary tensor in its computation, with an invocation example |
-| [add_with_scalar](./examples/add_with_scalar/add_with_scalar.cpp) | Custom Ele-wise operator with a scalar input, with an invocation example |
-| [reduce_sum](./examples/reduce_sum/reduce_sum.cpp) | Custom ReduceSum operator implemented with the ATVC Reduce template, with an invocation example |
-| [broadcast_to](./examples/broadcast_to/broadcast_to.cpp) | Custom BroadcastTo operator implemented with the ATVC Broadcast template, with an invocation example |
-
-For more operator categories and guidance on choosing a template, see the [Quick Start](./docs/1_quick_start.md).
+| Example | Description | Type |
+| ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| [add](./examples/add/add.cpp) | Add operator implemented with the ATVC Ele-wise template, with an invocation example | directly invoked operator |
+| [sinh_custom](./examples/sinh_custom/sinh_custom.cpp) | Custom Ele-wise operator that uses a temporary tensor in its computation, with an invocation example | directly invoked operator |
+| [add_with_scalar](./examples/add_with_scalar/add_with_scalar.cpp) | Custom Ele-wise operator with a scalar input, with an invocation example | directly invoked operator |
+| [reduce_sum](./examples/reduce_sum/reduce_sum.cpp) | Custom ReduceSum operator implemented with the ATVC Reduce template, with an invocation example | directly invoked operator |
+| [broadcast_to](./examples/broadcast_to/broadcast_to.cpp) | Custom BroadcastTo operator implemented with the ATVC Broadcast template, with an invocation example | directly invoked operator |
+| [ops_aclnn](./examples/aclnn) | Operator implementations and invocation examples built on ATVC inside a custom operator project | custom-project operator |
+| [ops_pytorch](./examples/pytorch) | Operator implementations and invocation examples built on ATVC for [pytorch](https://gitee.com/ascend/pytorch) | pytorch operator |
+
+For more operator categories and guidance on choosing a template, see the [Quick Start](./docs/1_quick_start.md). add, sinh_custom, add_with_scalar, reduce_sum, and broadcast_to are ATVC direct-invocation examples; ops_aclnn is the directory of operators that connect ATVC to an aclnn project, and ops_pytorch is the directory of operators that connect ATVC to a pytorch project.
diff --git a/atvc/docs/2_developer_guide.md b/atvc/docs/2_developer_guide.md
index 412e4ce5f3d19ca80d62dd7490023721d7ef0f94..c236bd991ca57a6d0eeb9c3dbf3ac16bb9aa81c3 100644
--- a/atvc/docs/2_developer_guide.md
+++ b/atvc/docs/2_developer_guide.md
@@ -86,7 +86,7 @@ EleWiseKernel<<<blockNum, nullptr, stream>>>(x, y, z, paramDevice);
 The following pseudocode shows how `ATVC::ReducePolicy` takes part in the computation when developing a Reduce operator; for the full flow see 2.2.5 Host-layer API:
 ```cpp
 // Declare the policy and param variables
-ATVC::ReducePolicy policy;
+ATVC::ReducePolicy policy = {-1, -1, -1};
 ATVC::ReduceParam param;
 // OpTraits is the operator description; the CalcReduceTiling API computes the best data-movement strategy (param) for this case and the policy that maps to the best template implementation
 ATVC::Host::CalcReduceTiling(..., &policy, &param);
@@ -671,7 +671,7 @@ int32_t main(int32_t argc, char* argv[])
     std::vector<int64_t> dim{0};
     std::vector<int64_t> shape{8, 1024};
     ATVC::ReduceParam param;
-    ATVC::ReducePolicy policy;
+    ATVC::ReducePolicy policy = {-1, -1, -1};
     if (!ATVC::Host::CalcReduceTiling(shape, dim, &param, &policy)) {
         printf("Reduce tiling error.");
         return -1;
@@ -727,7 +727,7 @@ using ReduceOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
-    ATVC::ReducePolicy policy;
+    ATVC::ReducePolicy policy = {-1, -1, -1};
     if (!ATVC::Host::CalcReduceTiling<ReduceOpTraits>(inputShape, reduceDim, &policy, &param)) {
         printf("Reduce tiling error.");
@@ -901,7 +901,7 @@ int32_t main(int32_t argc, char* argv[])
     CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
     ATVC::ReduceParam param;  // Reduce runtime parameter, holding the TilingData and temporary-buffer information
-    ATVC::ReducePolicy policy;  // Reduce runtime parameter that maps to the most suitable Reduce template implementation
+    ATVC::ReducePolicy policy = {-1, -1, -1};  // Reduce runtime parameter that maps to the most suitable Reduce template implementation
     // On the host side, call the tiling API to compute the runtime parameters
     if (!ATVC::Host::CalcReduceTiling<ReduceOpTraits>(shape, dim, &policy, &param)) {
         printf("Reduce tiling error.");
@@ -1153,7 +1153,7 @@ int32_t main(int32_t argc, char* argv[])
     std::vector<int64_t> shapeIn{1, 1024};   // test input shape
     std::vector<int64_t> shapeOut{8, 1024};  // test output shape
     ATVC::BroadcastParam param;  // Broadcast runtime parameter, holding the TilingData and temporary-buffer information
-    ATVC::BroadcastPolicy policy;  // Broadcast runtime parameter that maps to the most suitable Broadcast template implementation
+    ATVC::BroadcastPolicy policy = {-1, -1, -1};  // Broadcast runtime parameter that maps to the most suitable Broadcast template implementation
     // On the host side, call the tiling API to compute the runtime parameters
     if (!ATVC::Host::CalcBroadcastTiling(shapeIn, shapeOut, &policy, &param)) {
         printf("Broadcast tiling error.\n");
@@ -1200,7 +1200,7 @@ int32_t main(int32_t argc, char* argv[])
     // acl resource initialization
     ...
     ATVC::BroadcastParam param;  // Broadcast runtime parameter, holding the TilingData and temporary-buffer information
-    ATVC::BroadcastPolicy policy;  // Broadcast runtime parameter that maps to the most suitable Broadcast template implementation
+    ATVC::BroadcastPolicy policy = {-1, -1, -1};  // Broadcast runtime parameter that maps to the most suitable Broadcast template implementation
     // On the host side, call the tiling API to compute the runtime parameters
     (void)ATVC::Host::CalcBroadcastTiling(shapeIn, shapeOut, &policy, &param);
@@ -1323,7 +1323,7 @@ int32_t main(int32_t argc, char* argv[])
     CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
     ATVC::BroadcastParam param;  // Broadcast runtime parameter, holding the TilingData and temporary-buffer information
-    ATVC::BroadcastPolicy policy;  // Broadcast runtime parameter that maps to the most suitable Broadcast template implementation
+    ATVC::BroadcastPolicy policy = {-1, -1, -1};  // Broadcast runtime parameter that maps to the most suitable Broadcast template implementation
     // On the host side, call the tiling API to compute the runtime parameters
     if (!ATVC::Host::CalcBroadcastTiling(shapeIn, shapeOut, &policy, &param)) {
         printf("Broadcast tiling error.\n");
diff --git a/atvc/examples/aclnn/README.md b/atvc/examples/aclnn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..35df784ff75fa94f0c7b5109539eac6065ffaae7
--- /dev/null
+++ b/atvc/examples/aclnn/README.md
@@ -0,0 +1,296 @@
+## Overview
+Simple examples of connecting ATVC to an ACLNN project, suitable for beginners.
+
+## Custom operator samples
+Using the Ascend C programming language, these samples connect the ATVC framework to a custom operator project and provide a complete end-to-end implementation for each invocation method.
+
+## Operator development samples
+| Directory | Description |
+| ------------------------------------------------------------ | ---------------------------------------------------- |
+| [add](./add) | Custom Add vector operator based on the ATVC framework, with an AclNNInvocation sample |
+| [reduce_sum](./reduce_sum) | Custom reduce_sum vector operator based on the ATVC framework, with an AclNNInvocation sample |
+
+## Quick start
+
+  Run the example cases quickly; for a more detailed walkthrough see the [add operator](../add/README.md).
+
+- Download the ATVC code and configure the environment as described in the [Quick Start](../../../docs/1_quick_start.md).
+
+- Export the ATVC environment variable
+  ```bash
+  # If not set, the default path ./atvc/include is used
+  export ATVC_PATH=${atvc}/include
+  ```
+
+  - Run the add case
+
+    ```bash
+    # Build the custom Add operator with ATVC
+    $ cd ./atvc/examples/aclnn/add
+    # Taking 910B1 as an example, run:
+    $ bash install.sh -v Ascend910B1
+    # Install the custom package
+    $ cd CustomOp/build_out
+    $ ./custom_opp*.run
+    # Run the sample
+    $ cd ./atvc/examples/aclnn/add/AclNNInvocationNaive
+    $ bash run.sh
+    ...
+    test pass
+    ```
+
+## Supporting custom operators on the ATVC framework
+### Step 1. Generate the base directories and files of the custom operator project
+  Use [msopgen](https://www.hiascend.com/document/detail/zh/mindstudio/81RC1/ODtools/Operatordevelopmenttools/atlasopdev_16_0021.html) to create the base files of the operator project.
+  ```bash
+  rm -rf CustomOp
+  # Generate the op framework
+  msopgen gen -i AddCustom.json -c ai_core-Ascend910B1 -lan cpp -out CustomOp
+  ```
+  The generated directory structure is as follows:
+  ```
+  CustomOp
+  ├── build.sh                // build entry script
+  ├── cmake
+  │   ├── config.cmake
+  │   ├── util                // scripts and common build files used to build the operator project
+  ├── CMakeLists.txt          // CMakeLists.txt of the operator project
+  ├── CMakePresets.json       // build options
+  ├── framework               // operator plugin implementation directory; generating single-operator model files does not depend on it, so it can be ignored
+  ├── op_host                 // host-side implementation files
+  │   ├── add_custom_tiling.h // tiling definition of the operator
+  │   ├── add_custom.cpp      // operator prototype registration, shape inference, info library, tiling implementation
+  │   ├── CMakeLists.txt
+  ├── op_kernel               // kernel-side implementation files
+  │   ├── CMakeLists.txt
+  │   ├── add_custom.cpp      // operator implementation
+  ├── scripts                 // packaging scripts of the custom operator project
+  ```
+
+### Step 2. Modify the corresponding files
+
+- 2.1 Copy the required configuration files
+
+  Copy [func.cmake](./add/AddCustom/cmake/func.cmake), the host-side [CMakeLists.txt](./add/AddCustom/op_host/CMakeLists.txt) and the kernel-side [CMakeLists.txt](./add/AddCustom/op_kernel/CMakeLists.txt) into the matching directories of the project generated in Step 1.
+
+- 2.2 Modify the host files
+  - Include the matching header and rewrite the tiling generation inside TilingFunc, calling a different tiling strategy per operator category. For more ATVC usage see the ATVC [developer guide](../../docs/2_developer_guide.md).
+
+    Ele-wise category; see [add_custom.cpp](./add/AddCustom/op_host/add_custom.cpp)
+    ```cpp
+    // Include the header
+    #include "elewise/elewise_host.h"
+    ...
+    // Define the operator descriptions
+    using AddOpTraitsFloat = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>>;
+    using AddOpTraitsInt = ATVC::OpTraits<ATVC::OpInputs<int32_t, int32_t>, ATVC::OpOutputs<int32_t>>;
+
+    // Modify the corresponding TilingFunc
+    // Declare the runtime tiling parameter
+    ATVC::EleWiseParam *tiling = context->GetTilingData<ATVC::EleWiseParam>();
+    uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize();
+    // Select the operator description by data type
+    if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_FLOAT) {
+        // AddOpTraitsFloat is the Add operator description; the tiling data is computed from the operator's input/output count and the actual element count
+        (void)ATVC::Host::CalcEleWiseTiling<AddOpTraitsFloat>(totalLength, *tiling);
+    } else if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_INT32) {
+        (void)ATVC::Host::CalcEleWiseTiling<AddOpTraitsInt>(totalLength, *tiling);
+    }
+    // Set the tiling key
+    context->SetTilingKey(0);
+    // Set blockDim
+    context->SetBlockDim(tiling->tilingData.blockNum);
+    // Set the workspace size
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
+    currentWorkspace[0] = 0;
+    ...
+    ```
+
+    Broadcast category
+    ```cpp
+    #include "broadcast/broadcast_host.h"
+    // Define the operator descriptions
+    using BroadcastOpTraitsFloat = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+    using BroadcastOpTraitsInt = ATVC::OpTraits<ATVC::OpInputs<int32_t>, ATVC::OpOutputs<int32_t>>;
+    ...
+    // Modify the corresponding TilingFunc
+    // Get the input and output shapes
+    std::vector<int64_t> shapeIn;
+    std::vector<int64_t> shapeOut;
+    for (int32_t i = 0; i < inputShape0.GetDimNum(); i++) {
+        shapeIn.push_back(inputShape0.GetDim(i));
+    }
+    for (int32_t i = 0; i < outputShape0.GetDimNum(); i++) {
+        shapeOut.push_back(outputShape0.GetDim(i));
+    }
+    // Declare the runtime tiling parameter
+    ATVC::BroadcastParam *tiling = context->GetTilingData<ATVC::BroadcastParam>();
+    ATVC::BroadcastPolicy policy = {-1, -1, -1};
+    // Select the operator description by data type
+    if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_FLOAT) {
+        // BroadcastOpTraitsFloat is the Broadcast operator description; the tiling data is computed from the input and output shapes
+        (void)ATVC::Host::CalcBroadcastTiling<BroadcastOpTraitsFloat>(shapeIn, shapeOut, &policy, tiling);
+    } else if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_INT32) {
+        (void)ATVC::Host::CalcBroadcastTiling<BroadcastOpTraitsInt>(shapeIn, shapeOut, &policy, tiling);
+    }
+    // Bind each policy to a tiling key; the kernel side uses the tiling key to pick the matching operator template
+    if (policy == ATVC::BROADCAST_POLICY0) {
+        context->SetTilingKey(0);
+    } else if (policy == ATVC::BROADCAST_POLICY1) {
+        context->SetTilingKey(1);
+    }
+    // Set blockDim
+    context->SetBlockDim(tiling->tilingData.coreNum);
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
+    currentWorkspace[0] = 0;
+    ```
+
+    Reduce-sum category; see [reduce_sum_custom.cpp](./reduce_sum/ReduceSumCustom/op_host/reduce_sum_custom.cpp)
+    ```cpp
+    // Include the header
+    #include "reduce/reduce_host.h"
+    // Define the operator descriptions
+    using ReduceOpTraitsFloat = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+    using ReduceOpTraitsInt = ATVC::OpTraits<ATVC::OpInputs<int32_t>, ATVC::OpOutputs<int32_t>>;
+    ...
+    // Modify the corresponding TilingFunc
+    ATVC::ReducePolicy policy = {0, 0, 0};
+    auto inputShape0 = context->GetInputShape(0)->GetOriginShape();
+    std::vector<int64_t> shapeIn;
+    for (int32_t i = 0; i < inputShape0.GetDimNum(); i++) {
+        shapeIn.push_back(inputShape0.GetDim(i));
+    }
+    // Get the dim attribute
+    const gert::RuntimeAttrs *runtimeAttrs = context->GetAttrs();
+    const gert::TypedContinuousVector<int64_t> *attr0 = runtimeAttrs->GetListInt(0);
+    const int64_t *arr = reinterpret_cast<const int64_t *>(attr0->GetData());
+    std::vector<int64_t> dim(arr, arr + attr0->GetSize());
+
+    // Declare the runtime tiling parameter
+    ATVC::ReduceParam *tiling = context->GetTilingData<ATVC::ReduceParam>();
+    // Select the operator description by data type
+    if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_FLOAT) {
+        // ReduceOpTraitsFloat is the Reduce operator description; the tiling data is computed from the input shape and dim
+        (void)ATVC::Host::CalcReduceTiling<ReduceOpTraitsFloat>(shapeIn, dim, &policy, tiling);
+    } else if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_INT32) {
+        (void)ATVC::Host::CalcReduceTiling<ReduceOpTraitsInt>(shapeIn, dim, &policy, tiling);
+    }
+    // Record the policyId so the kernel side can branch on it
+    tiling->policyId = policy.getID();
+    // Set blockDim
+    context->SetBlockDim(tiling->tilingData.coreNum);
+    ```
+  - 2.3 Modify the kernel files
+
+    Developers build the core computation of the Add operator with AscendC APIs. In the ATVC framework this computation is implemented as a functor struct: it takes `ATVC::OpTraits` as a fixed template parameter and overloads `operator()` so the provided kernel-layer operator template class can call it. For more ATVC usage see the ATVC [developer guide](../../docs/2_developer_guide.md).
+
+    Ele-wise category: [add_custom.cpp](./add/AddCustom/op_kernel/add_custom.cpp)
+    ```cpp
+    // Include the header
+    #include "elewise/elewise_device.h"
+    ...
+    // Define the operator description
+    using AddOpTraits = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>>;
+
+    ...
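+    // Note: per the developer guide, EleWiseOpTemplate owns the data movement between
+    // global and local memory; the functor below only has to describe the per-tile
+    // computation on the LocalTensor slices that the template hands to operator().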
+    // Add AddComputeFunc
+    // ATVC::OpTraits is passed in as the compile-time parameter
+    template <typename OpTraits>
+    struct AddComputeFunc {
+        /*
+        Function: z = x + y
+        Parameters:
+            x : input of the computation
+            y : input of the computation
+            z : output of the computation
+        */
+        template <typename DataType>
+        // Overload operator() so the provided operator template class can call it
+        __aicore__ inline void operator()(AscendC::LocalTensor<DataType> x, AscendC::LocalTensor<DataType> y, AscendC::LocalTensor<DataType> z) {
+            AscendC::Add(z, x, y, z.GetSize()); // Implement the computation with AscendC APIs; z.GetSize() returns the element count of a single pass
+        }
+    };
+
+    // Modify the kernel function implementation
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);
+    REGISTER_TILING_DEFAULT(ATVC::EleWiseParam);
+    auto op = ATVC::Kernel::EleWiseOpTemplate<AddComputeFunc<AddOpTraits>>();
+    op.Run(x, y, z, tiling);
+    ```
+    Broadcast category
+    ```cpp
+    // Include the header
+    #include "broadcast/broadcast_device.h"
+    // Define the operator description
+    using BroadcastOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+    ...
+
+    // Modify the kernel function
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);
+    REGISTER_TILING_DEFAULT(ATVC::BroadcastParam);
+    // Broadcast has several policies, bound to tiling keys on the host; TILING_KEY_IS here matches the host file's SetTilingKey calls
+    if (TILING_KEY_IS(0)) {
+        auto op = ATVC::Kernel::BroadcastOpTemplate<BroadcastComputeFunc<BroadcastOpTraits>, ATVC::BROADCAST_POLICY0>();
+        op.Run(x, y, tiling);
+    } else {
+        ...
+    }
+    ```
+    Reduce-sum category: [reduce_sum_custom.cpp](./reduce_sum/ReduceSumCustom/op_kernel/reduce_sum_custom.cpp)
+    ```cpp
+    // Include the header
+    #include "reduce/reduce_device.h"
+    // Define the operator description
+    using ReduceOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+    ...
+
+    // Modify the kernel function
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0);
+    REGISTER_TILING_DEFAULT(ATVC::ReduceParam);
+    GET_TILING_DATA(param, tiling);
+    if (param.policyId == ATVC::REDUCE_POLICY0.ID) {
+        auto op = ATVC::Kernel::ReduceOpTemplate<ReduceSumComputeFunc<ReduceOpTraits>, ATVC::REDUCE_POLICY0>();
+        op.Run(x, y, tiling);
+    } else {
+        // Instantiate a different ReduceOpTemplate according to tiling.policyId
+        ...
+    }
+    ```
+    TILING_KEY_IS is not used for branching here: Reduce has many policies, and dispatching on the tiling key can overflow the stack, so branching on `param.policyId` is recommended instead.
+
+### Step 3. Build the operator project
+
+  Run the following commands in the operator project directory to build it.
+  ```bash
+  $ cd CustomOp
+  $ bash build.sh
+  ```
+After the script succeeds and the build completes, the custom operator package custom_opp_<os>_<arch>.run, for example "custom_opp_ubuntu_x86_64.run", is generated under CustomOp/build_out.
+
+### Step 4. Deploy the custom operator package
+
+  Before deploying the package, make sure the default deployment path environment variable ASCEND_OPP_PATH exists
+
+  ```bash
+  echo $ASCEND_OPP_PATH
+  # sample output: /usr/local/Ascend/ascend-toolkit/latest/opp
+
+  # if it is missing, export the CANN environment variables
+  source [ASCEND_INSTALL_PATH]/bin/setenv.bash
+  # e.g. source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash
+  ```
+Parameters:
+
+  ASCEND_INSTALL_PATH: installation path of the CANN package, usually the same path as in the previous step.
+In the directory containing the custom operator package, run the commands below to install it.
+Once the command succeeds, the files in the package are deployed into the vendors/customize directory under the opp operator library path pointed to by the ASCEND_OPP_PATH environment variable.
+
+  ```bash
+  cd build_out
+  ./custom_opp_<os>_<arch>.run
+  ```
+
+### Step 5. Invoke the operator project
+
+  With the operator files complete, follow [invoking the AddCustom operator project via aclnn (simplified code)](https://gitee.com/ascend/samples/blob/master/operator/ascendc/0_introduction/1_add_frameworklaunch/AclNNInvocationNaive/README.md) to build and verify.
\ No newline at end of file
diff --git a/atvc/examples/aclnn/add/AclNNInvocationNaive/CMakeLists.txt b/atvc/examples/aclnn/add/AclNNInvocationNaive/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..57a2c66376526b6a80f9d407e3f4e13d78022104
--- /dev/null
+++ b/atvc/examples/aclnn/add/AclNNInvocationNaive/CMakeLists.txt
@@ -0,0 +1,60 @@
+# Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+
+# CMake lowest version requirement
+cmake_minimum_required(VERSION 3.5.1)
+
+# project information
+project(acl_execute_add)
+
+# Compile options
+add_compile_options(-std=c++11)
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "./")
+
+set(INC_PATH $ENV{DDK_PATH})
+
+if (NOT DEFINED ENV{DDK_PATH})
+    set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest")
+    message(STATUS "set default INC_PATH: ${INC_PATH}")
+else ()
+    message(STATUS "env INC_PATH: ${INC_PATH}")
+endif()
+
+set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/customize/op_api")
+
+set(LIB_PATH $ENV{NPU_HOST_LIB})
+
+# Dynamic libraries in the stub directory can only be used for compilation
+if (NOT DEFINED ENV{NPU_HOST_LIB})
+    string(TOLOWER "${CMAKE_SYSTEM_NAME}" SYSTEM_NAME_LOWER)
+    set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/${CMAKE_SYSTEM_PROCESSOR}-${SYSTEM_NAME_LOWER}/devlib")
+    message(STATUS "set default LIB_PATH: ${LIB_PATH}")
+else ()
+    message(STATUS "env LIB_PATH: ${LIB_PATH}")
+endif()
+
+# Header path
+include_directories(
+    ${INC_PATH}/include
+    ${CUST_PKG_PATH}/include
+)
+
+# add host lib path
+link_directories(
+    ${LIB_PATH}
+    ${CUST_PKG_PATH}/lib
+)
+
+add_executable(execute_add_op
+    main.cpp
+)
+
+target_link_libraries(execute_add_op
+    ascendcl
+    cust_opapi
+    acl_op_compiler
+    nnopbase
+    stdc++
+)
+
+install(TARGETS execute_add_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
diff --git a/atvc/examples/aclnn/add/AclNNInvocationNaive/README.md b/atvc/examples/aclnn/add/AclNNInvocationNaive/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e826fd573f773eb04fa6ade96590eadc9c750501
--- /dev/null
+++ b/atvc/examples/aclnn/add/AclNNInvocationNaive/README.md
@@ -0,0 +1,52 @@
+## Overview
+Compared with the AclNNInvocation sample project, this sample simplifies the project configuration.
+## Directory layout
+```
+├── AclNNInvocationNaive
+│   ├── CMakeLists.txt // build rules
+│   ├── main.cpp       // entry point of the single-operator invocation application
+│   └── run.sh         // script that builds and runs the operator
+```
+## Implementation notes
+After the custom operator is developed and deployed, its behaviour can be verified through a single-operator invocation. main.cpp uses the single-operator API execution mode: the operator is executed through C APIs, with no single-operator description file or offline model conversion; the single-operator API is called directly.
+
+After the custom operator is built and deployed, the single-operator API is generated automatically and can be called directly from an application. The operator API generally takes the form of a "two-stage interface", like:
+ ```cpp
+ // Get the workspace size the operator needs
+ aclnnStatus aclnnAddCustomGetWorkspaceSize(const aclTensor *x, const aclTensor *y, const aclTensor *out, uint64_t *workspaceSize, aclOpExecutor **executor);
+ // Execute the operator
+ aclnnStatus aclnnAddCustom(void *workspace, uint64_t workspaceSize, aclOpExecutor *executor, aclrtStream stream);
+ ```
+`aclnnAddCustomGetWorkspaceSize` is the first stage: it computes how much workspace memory this API call needs. After obtaining the required `workspace` size, allocate that much device memory, then call the second stage, `aclnnAddCustom`, to run the computation. See the [single-operator API invocation](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp) chapter for details.
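+
+A minimal sketch of the two-stage call flow (assumptions: `x`, `y`, `out` are valid `aclTensor` handles, `stream` is a created `aclrtStream`, and error handling is elided; main.cpp below shows the complete version):
+ ```cpp
+ uint64_t workspaceSize = 0;
+ aclOpExecutor *executor = nullptr;
+ // Stage 1: query how much device workspace this call needs
+ aclnnStatus st = aclnnAddCustomGetWorkspaceSize(x, y, out, &workspaceSize, &executor);
+ void *workspace = nullptr;
+ if (st == ACL_SUCCESS && workspaceSize > 0) {
+     aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); // allocate the requested workspace
+ }
+ // Stage 2: launch the operator on the stream with that workspace
+ st = aclnnAddCustom(workspace, workspaceSize, executor, stream);
+ aclrtSynchronizeStream(stream); // wait for completion before reading results
+ ```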
+## Running the sample operator
+### 1. Build the operator project
+Before running this sample, finish the preparation described in [building the operator project](../README.md#operatorcompile).
+### 2. Run the aclnn invocation sample
+
+  - Enter the sample directory
+    Download the sample code from the command line, taking the master branch as an example.
+    ```bash
+    cd atvc/examples/aclnn/add/AclNNInvocationNaive
+    ```
+  - Adjust the sample build file
+
+    In CMakeLists.txt, replace "/usr/local/Ascend/ascend-toolkit/latest" with the actual installation path of the CANN package,
+    e.g. /home/HwHiAiUser/Ascend/ascend-toolkit/latest
+
+  - Configure environment variables
+
+    The NPU_HOST_LIB environment variable must be set; taking x86 as an example:
+    ```bash
+    export NPU_HOST_LIB=/home/HwHiAiUser/Ascend/ascend-toolkit/latest/x86_64-linux/lib64
+    ```
+  - Run the sample
+
+    Use the run.sh script to build and run.
+    ```bash
+    bash run.sh
+    ```
+
+## Change log
+| Date | Change |
+| ---------- | ------------ |
+| 2025/07/22 | Added this readme |
\ No newline at end of file
diff --git a/atvc/examples/aclnn/add/AclNNInvocationNaive/main.cpp b/atvc/examples/aclnn/add/AclNNInvocationNaive/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..694a61ce29a8f7997123c84f2b93127240a1eb7b
--- /dev/null
+++ b/atvc/examples/aclnn/add/AclNNInvocationNaive/main.cpp
@@ -0,0 +1,199 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <random>
+#include <vector>
+
+#include "acl/acl.h"
+#include "aclnn_add_custom.h"
+namespace {
+#define CHECK_RET(cond, return_expr) \
+    do {                             \
+        if (!(cond)) {               \
+            return_expr;             \
+        }                            \
+    } while (0)
+
+#define LOG_PRINT(message, ...)         \
+    do {                                \
+        printf(message, ##__VA_ARGS__); \
+    } while (0)
+
+int64_t GetShapeSize(const std::vector<int64_t> &shape)
+{
+    int64_t shapeSize = 1;
+    for (auto i : shape) {
+        shapeSize *= i;
+    }
+    return shapeSize;
+}
+
+int Init(int32_t deviceId, aclrtStream *stream)
+{
+    // Fixed code, acl initialization
+    auto ret = aclInit(nullptr);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return 1);
+    ret = aclrtSetDevice(deviceId);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return 1);
+    ret = aclrtCreateStream(stream);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return 1);
+
+    return 0;
+}
+
+template <typename T>
+int CreateAclTensor(const std::vector<T> &hostData, const std::vector<int64_t> &shape, void **deviceAddr,
+                    aclDataType dataType, aclTensor **tensor)
+{
+    auto size = GetShapeSize(shape) * sizeof(T);
+    // Call aclrtMalloc to allocate device memory
+    auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return 1);
+
+    // Call aclrtMemcpy to copy host data to device memory
+    ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return 1);
ERROR: %d\n", ret); return 1); + + // Call aclCreateTensor to create a aclTensor object + *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, nullptr, 0, aclFormat::ACL_FORMAT_ND, shape.data(), + shape.size(), *deviceAddr); + return 0; +} + +void DestroyResources(std::vector tensors, std::vector deviceAddrs, aclrtStream stream, + int32_t deviceId, void *workspaceAddr = nullptr) +{ + // Release aclTensor and device + for (uint32_t i = 0; i < tensors.size(); i++) { + if (tensors[i] != nullptr) { + aclDestroyTensor(reinterpret_cast(tensors[i])); + } + if (deviceAddrs[i] != nullptr) { + aclrtFree(deviceAddrs[i]); + } + } + if (workspaceAddr != nullptr) { + aclrtFree(workspaceAddr); + } + // Destroy stream and reset device + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); +} + +void InitializeData(std::vector &inputX, std::vector &inputY, std::vector &inputZ, + std::vector &golden, std::vector &shape) +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(1.0f, 100.0f); + + for (int i = 0; i < shape[0] * shape[1]; ++i) { + inputX[i] = dis(gen); + inputY[i] = dis(gen); + golden[i] = inputX[i] + inputY[i]; + inputZ[i] = 0.0; + } +} + +bool VerifyResults(const std::vector &goldenData, const std::vector &resultData) +{ + int64_t len = 10; + LOG_PRINT("result is:\n"); + for (int64_t i = 0; i < len; i++) { + LOG_PRINT("%.1f ", resultData[i]); + } + LOG_PRINT("\n"); + if (std::equal(resultData.begin(), resultData.end(), goldenData.begin())) { + LOG_PRINT("test pass\n"); + } else { + LOG_PRINT("test failed\n"); + return false; + } + return true; +} +} + +int main(int argc, char **argv) +{ + // 1. (Fixed code) Initialize device / stream, refer to the list of external interfaces of acl + // Update deviceId to your own device id + int32_t deviceId = 0; + aclrtStream stream; + auto ret = Init(deviceId, &stream); + CHECK_RET(ret == 0, LOG_PRINT("Init acl failed. ERROR: %d\n", ret); return 1); + + // 2. Create input and output, need to customize according to the interface of the API + std::vector shape = {8, 2048}; + void *inputXDeviceAddr = nullptr; + void *inputYDeviceAddr = nullptr; + void *outputZDeviceAddr = nullptr; + aclTensor *inputX = nullptr; + aclTensor *inputY = nullptr; + aclTensor *outputZ = nullptr; + std::vector inputXHostData(shape[0] * shape[1]); + std::vector inputYHostData(shape[0] * shape[1]); + std::vector outputZHostData(shape[0] * shape[1]); + std::vector goldenData(shape[0] * shape[1]); + + InitializeData(inputXHostData, inputYHostData, outputZHostData, goldenData, shape); + std::vector tensors = {inputX, inputY, outputZ}; + std::vector deviceAddrs = {inputXDeviceAddr, inputYDeviceAddr, outputZDeviceAddr}; + // Create inputX aclTensor + ret = CreateAclTensor(inputXHostData, shape, &inputXDeviceAddr, aclDataType::ACL_FLOAT, &inputX); + CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return 1); + // Create inputY aclTensor + ret = CreateAclTensor(inputYHostData, shape, &inputYDeviceAddr, aclDataType::ACL_FLOAT, &inputY); + CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return 1); + // Create outputZ aclTensor + ret = CreateAclTensor(outputZHostData, shape, &outputZDeviceAddr, aclDataType::ACL_FLOAT, &outputZ); + CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return 1); + + // 3. 
+    // 3. Call the API of the custom operator library
+    uint64_t workspaceSize = 0;
+    aclOpExecutor *executor;
+    // Calculate the workspace size and allocate memory for it
+    ret = aclnnAddCustomGetWorkspaceSize(inputX, inputY, outputZ, &workspaceSize, &executor);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnAddCustomGetWorkspaceSize failed. ERROR: %d\n", ret);
+              DestroyResources(tensors, deviceAddrs, stream, deviceId); return 1);
+
+    void *workspaceAddr = nullptr;
+    if (workspaceSize > 0U) {
+        ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+        CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+    }
+    // Execute the custom operator
+    ret = aclnnAddCustom(workspaceAddr, workspaceSize, executor, stream);
+    CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+
+    // 4. (Fixed code) Synchronize and wait for the task to complete
+    ret = aclrtSynchronizeStream(stream);
+    CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+
+    // 5. Get the output value, copy the result from device memory to host memory, need to modify according to the
+    // interface of the API
+    auto size = GetShapeSize(shape);
+    std::vector<float> resultData(size, 0);
+    ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]), outputZDeviceAddr,
+                      size * sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST);
+    CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+
+    // 6. Destroy resources, need to modify according to the interface of the API
+    DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr);
+
+    // print and verify the output result
+    if (!VerifyResults(goldenData, resultData)) {
+        return -1;
+    }
+    return 0;
+}
diff --git a/atvc/examples/aclnn/add/AclNNInvocationNaive/run.sh b/atvc/examples/aclnn/add/AclNNInvocationNaive/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7abad744e9245564546a2b97d8b1bba68b659dd5
--- /dev/null
+++ b/atvc/examples/aclnn/add/AclNNInvocationNaive/run.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+source $_ASCEND_INSTALL_PATH/bin/setenv.bash
+export DDK_PATH=$_ASCEND_INSTALL_PATH
+export NPU_HOST_LIB=$_ASCEND_INSTALL_PATH/$(arch)-$(uname -s | tr '[:upper:]' '[:lower:]')/devlib
+
+set -e
+rm -rf build
+mkdir -p build
+cmake -B build -DCMAKE_SKIP_RPATH=TRUE
+cmake --build build -j
+(
+    cd build
+    export LD_LIBRARY_PATH=$_ASCEND_INSTALL_PATH/opp/vendors/customize/op_api/lib:$LD_LIBRARY_PATH
+    ./execute_add_op
+)
diff --git a/atvc/examples/aclnn/add/AddCustom.json b/atvc/examples/aclnn/add/AddCustom.json
new file mode 100644
index 0000000000000000000000000000000000000000..f55a31b9bb3761f2b2a4bc7e8c152aac6e1511b6
--- /dev/null
+++ b/atvc/examples/aclnn/add/AddCustom.json
@@ -0,0 +1,46 @@
+[
+    {
+        "op": "AddCustom",
+        "language": "cpp",
+        "input_desc": [
+            {
+                "name": "x",
+                "param_type": "required",
+                "format": [
+                    "ND",
+                    "ND"
+                ],
+                "type": [
+                    "float32",
+                    "int32"
+                ]
+            },
+            {
+                "name": "y",
+                "param_type": "required",
+                "format": [
+                    "ND",
+                    "ND"
+                ],
"type": [ + "float32", + "int32" + ] + } + ], + "output_desc": [ + { + "name": "z", + "param_type": "required", + "format": [ + "ND", + "ND" + ], + "type": [ + "float32", + "int32" + ] + } + ] + } +] \ No newline at end of file diff --git a/atvc/examples/aclnn/add/AddCustom/CMakeLists.txt b/atvc/examples/aclnn/add/AddCustom/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7e1f43e5a2ed42c2ee3395308378e61eb15606f --- /dev/null +++ b/atvc/examples/aclnn/add/AddCustom/CMakeLists.txt @@ -0,0 +1,76 @@ +cmake_minimum_required(VERSION 3.16.0) +project(opp) + +include(cmake/config.cmake) +include(cmake/func.cmake) +include(cmake/intf.cmake) + +if(ENABLE_CROSS_COMPILE) + if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL x86_64) + set(CROSS_COMPILE_PLATFORM aarch64) + else() + set(CROSS_COMPILE_PLATFORM x86_64) + endif() + set(PLATFORM ${CMAKE_SYSTEM_PROCESSOR}) + set(CMAKE_COMPILE_COMPILER_LIBRARY ${ASCEND_CANN_PACKAGE_PATH}/${PLATFORM}-linux/devlib/linux/${CROSS_COMPILE_PLATFORM}/) + set(CMAKE_COMPILE_RUNTIME_LIBRARY ${ASCEND_CANN_PACKAGE_PATH}/${PLATFORM}-linux/devlib/${CROSS_COMPILE_PLATFORM}/) + if(CMAKE_CROSS_LIBRARY_PATH) + set(CMAKE_COMPILE_COMPILER_LIBRARY ${CMAKE_CROSS_LIBRARY_PATH}) + set(CMAKE_COMPILE_RUNTIME_LIBRARY ${CMAKE_CROSS_LIBRARY_PATH}) + endif() + set(CMAKE_SYSTEM_PROCESSOR ${CROSS_COMPILE_PLATFORM}) + set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER}) + set(CMAKE_CXX_COMPILER ${CMAKE_CROSS_PLATFORM_COMPILER}) +else() + set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER}) +endif() + +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework) + add_subdirectory(framework) +endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) + add_subdirectory(op_host) +endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) + add_subdirectory(op_kernel) +endif() +if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) +endif() + +# modify vendor_name in install.sh and upgrade.sh +add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh + COMMAND mkdir -p ${CMAKE_BINARY_DIR}/scripts + COMMAND cp -r ${CMAKE_SOURCE_DIR}/scripts/* ${CMAKE_BINARY_DIR}/scripts/ + COMMAND sed -i "s/vendor_name=customize/vendor_name=${vendor_name}/g" ${CMAKE_BINARY_DIR}/scripts/* +) +add_custom_target(modify_vendor ALL DEPENDS ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh) + +get_system_info(SYSTEM_INFO) + +# gen version.info +add_custom_target(gen_version_info ALL + COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/util/gen_version_info.sh ${ASCEND_CANN_PACKAGE_PATH} ${CMAKE_CURRENT_BINARY_DIR} +) + +if(NOT ASCEND_PACK_SHARED_LIBRARY) + install(DIRECTORY ${CMAKE_BINARY_DIR}/scripts/ DESTINATION . 
FILE_PERMISSIONS OWNER_EXECUTE OWNER_READ GROUP_READ) + + install(FILES ${CMAKE_SOURCE_DIR}/custom.proto DESTINATION packages OPTIONAL) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/version.info + DESTINATION packages/vendors/${vendor_name}/) + + # CPack config + set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME}) + set(CPACK_PACKAGE_VERSION ${CMAKE_PROJECT_VERSION}) + set(CPACK_PACKAGE_DESCRIPTION "CPack opp project") + set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CPack opp project") + set(CPACK_PACKAGE_DIRECTORY ${CMAKE_INSTALL_PREFIX}) + set(CPACK_PACKAGE_FILE_NAME "custom_opp_${SYSTEM_INFO}.run") + set(CPACK_GENERATOR External) + set(CPACK_CMAKE_GENERATOR "Unix Makefiles") + set(CPACK_EXTERNAL_ENABLE_STAGING TRUE) + set(CPACK_EXTERNAL_PACKAGE_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/makeself.cmake) + set(CPACK_EXTERNAL_BUILT_PACKAGES ${CPACK_PACKAGE_DIRECTORY}/_CPack_Packages/Linux/External/${CPACK_PACKAGE_FILE_NAME}/${CPACK_PACKAGE_FILE_NAME}) + include(CPack) +endif() diff --git a/atvc/examples/aclnn/add/AddCustom/cmake/func.cmake b/atvc/examples/aclnn/add/AddCustom/cmake/func.cmake new file mode 100644 index 0000000000000000000000000000000000000000..e234f442aedc54b40f2d557d5ead27d7e8d880f5 --- /dev/null +++ b/atvc/examples/aclnn/add/AddCustom/cmake/func.cmake @@ -0,0 +1,377 @@ +include(ExternalProject) +if(DEFINED ENV{ATVC_PATH}) + set(ATVC_PATH $ENV{ATVC_PATH}) +else() + set(ATVC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../include") +endif() +function(get_system_info SYSTEM_INFO) + if (UNIX) + execute_process(COMMAND grep -i ^id= /etc/os-release OUTPUT_VARIABLE TEMP) + string(REGEX REPLACE "\n|id=|ID=|\"" "" SYSTEM_NAME ${TEMP}) + set(${SYSTEM_INFO} ${SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR} PARENT_SCOPE) + elseif (WIN32) + message(STATUS "System is Windows. Only for pre-build.") + else () + message(FATAL_ERROR "${CMAKE_SYSTEM_NAME} not support.") + endif () +endfunction() + +function(opbuild) + message(STATUS "Opbuild generating sources") + cmake_parse_arguments(OPBUILD "" "OUT_DIR;PROJECT_NAME;ACCESS_PREFIX;ENABLE_SOURCE" "OPS_SRC" ${ARGN}) + execute_process(COMMAND ${CMAKE_COMPILE} -g -fPIC -shared -std=c++17 ${OPBUILD_OPS_SRC} -D_GLIBCXX_USE_CXX11_ABI=0 + -I ${ASCEND_CANN_PACKAGE_PATH}/include -I ${CMAKE_CURRENT_SOURCE_DIR}/../op_kernel + -L ${ASCEND_CANN_PACKAGE_PATH}/lib64 -lexe_graph -lregister -ltiling_api -I ${ATVC_PATH} + -o ${OPBUILD_OUT_DIR}/libascend_all_ops.so + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message("build ops lib info: ${EXEC_INFO}") + message("build ops lib error: ${EXEC_ERROR}") + message(FATAL_ERROR "opbuild run failed!") + endif() + set(proj_env "") + set(prefix_env "") + if (NOT "${OPBUILD_PROJECT_NAME}x" STREQUAL "x") + set(proj_env "OPS_PROJECT_NAME=${OPBUILD_PROJECT_NAME}") + endif() + if (NOT "${OPBUILD_ACCESS_PREFIX}x" STREQUAL "x") + set(prefix_env "OPS_DIRECT_ACCESS_PREFIX=${OPBUILD_ACCESS_PREFIX}") + endif() + + set(ENV{ENABLE_SOURCE_PACAKGE} ${OPBUILD_ENABLE_SOURCE}) + if(${ASCEND_PACK_SHARED_LIBRARY}) + if (NOT vendor_name) + message(FATAL_ERROR "ERROR: vendor_name is invalid!") + return() + endif() + set(ENV{ASCEND_VENDOR_NAME} ${vendor_name}) + set(ENV{OPS_PRODUCT_NAME} ${ASCEND_COMPUTE_UNIT}) + set(ENV{SYSTEM_PROCESSOR} ${CMAKE_SYSTEM_PROCESSOR}) + endif() + execute_process(COMMAND ${proj_env} ${prefix_env} ${ASCEND_CANN_PACKAGE_PATH}/toolkit/tools/opbuild/op_build + ${OPBUILD_OUT_DIR}/libascend_all_ops.so ${OPBUILD_OUT_DIR} + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + 
+                    ERROR_VARIABLE EXEC_ERROR
+    )
+    unset(ENV{ENABLE_SOURCE_PACAKGE})
+    if(${ASCEND_PACK_SHARED_LIBRARY})
+        unset(ENV{ASCEND_VENDOR_NAME})
+        unset(ENV{OPS_PRODUCT_NAME})
+        unset(ENV{SYSTEM_PROCESSOR})
+    endif()
+    if (${EXEC_RESULT})
+        message("opbuild ops info: ${EXEC_INFO}")
+        message("opbuild ops error: ${EXEC_ERROR}")
+    endif()
+    message(STATUS "Opbuild generating sources - done")
+endfunction()
+
+function(add_ops_info_target)
+    cmake_parse_arguments(OPINFO "" "TARGET;OPS_INFO;OUTPUT;INSTALL_DIR" "" ${ARGN})
+    get_filename_component(opinfo_file_path "${OPINFO_OUTPUT}" DIRECTORY)
+    add_custom_command(OUTPUT ${OPINFO_OUTPUT}
+        COMMAND mkdir -p ${opinfo_file_path}
+        COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/parse_ini_to_json.py
+                ${OPINFO_OPS_INFO} ${OPINFO_OUTPUT}
+    )
+    add_custom_target(${OPINFO_TARGET} ALL
+        DEPENDS ${OPINFO_OUTPUT}
+    )
+    if(NOT ${ASCEND_PACK_SHARED_LIBRARY})
+        install(FILES ${OPINFO_OUTPUT}
+                DESTINATION ${OPINFO_INSTALL_DIR}
+        )
+    endif()
+endfunction()
+
+function(add_ops_compile_options OP_TYPE)
+    cmake_parse_arguments(OP_COMPILE "" "OP_TYPE" "COMPUTE_UNIT;OPTIONS" ${ARGN})
+    execute_process(COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_gen_options.py
+                            ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS} ${OP_TYPE} ${OP_COMPILE_COMPUTE_UNIT}
+                            ${OP_COMPILE_OPTIONS}
+                    RESULT_VARIABLE EXEC_RESULT
+                    OUTPUT_VARIABLE EXEC_INFO
+                    ERROR_VARIABLE EXEC_ERROR)
+    if (${EXEC_RESULT})
+        message("add ops compile options info: ${EXEC_INFO}")
+        message("add ops compile options error: ${EXEC_ERROR}")
+        message(FATAL_ERROR "add ops compile options failed!")
+    endif()
+endfunction()
+
+function(add_npu_support_target)
+    cmake_parse_arguments(NPUSUP "" "TARGET;OPS_INFO_DIR;OUT_DIR;INSTALL_DIR" "" ${ARGN})
+    get_filename_component(npu_sup_file_path "${NPUSUP_OUT_DIR}" DIRECTORY)
+    add_custom_command(OUTPUT ${NPUSUP_OUT_DIR}/npu_supported_ops.json
+        COMMAND mkdir -p ${NPUSUP_OUT_DIR}
+        COMMAND ${CMAKE_SOURCE_DIR}/cmake/util/gen_ops_filter.sh
+                ${NPUSUP_OPS_INFO_DIR}
+                ${NPUSUP_OUT_DIR}
+    )
+    add_custom_target(npu_supported_ops ALL
+        DEPENDS ${NPUSUP_OUT_DIR}/npu_supported_ops.json
+    )
+    if(NOT ${ASCEND_PACK_SHARED_LIBRARY})
+        install(FILES ${NPUSUP_OUT_DIR}/npu_supported_ops.json
+                DESTINATION ${NPUSUP_INSTALL_DIR}
+        )
+    endif()
+endfunction()
+
+function(add_simple_kernel_compile)
+    set(options "")
+    set(single_value_args "OPS_INFO;OUT_DIR;TILING_LIB;OP_TYPE;SRC;COMPUTE_UNIT;JSON_FILE;DYNAMIC_PATH")
+    set(multi_value_args "OPTIONS;CONFIGS")
+    cmake_parse_arguments(BINCMP "${options}" "${single_value_args}" "${multi_value_args}" ${ARGN})
+    if (NOT DEFINED BINCMP_OUT_DIR)
+        set(BINCMP_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/binary)
+    endif()
+    if (NOT DEFINED BINCMP_TILING_LIB)
+        set(BINCMP_TILING_LIB $<TARGET_FILE:cust_optiling>)
+    endif()
+    if (${ASCEND_PACK_SHARED_LIBRARY})
+        if (NOT TARGET op_kernel_pack)
+            add_custom_target(op_kernel_pack
+                COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_pack_kernel.py
+                        --input-path=${BINCMP_OUT_DIR}
+                        --output-path=${BINCMP_OUT_DIR}/library
+                        --enable-library=${ASCEND_PACK_SHARED_LIBRARY}
+                        --platform=${CMAKE_SYSTEM_PROCESSOR})
+            add_library(ascend_kernels INTERFACE)
+            target_link_libraries(ascend_kernels INTERFACE kernels)
+            target_link_directories(ascend_kernels INTERFACE ${BINCMP_OUT_DIR}/library)
+            target_include_directories(ascend_kernels INTERFACE ${BINCMP_OUT_DIR}/library)
+            add_dependencies(ascend_kernels op_kernel_pack)
+            add_dependencies(op_kernel_pack ${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT})
endif() + endif() + # add Environment Variable Configurations of ccache + set(_ASCENDC_ENV_VAR) + if(${CMAKE_CXX_COMPILER_LAUNCHER} MATCHES "ccache$") + list(APPEND _ASCENDC_ENV_VAR export ASCENDC_CCACHE_EXECUTABLE=${CMAKE_CXX_COMPILER_LAUNCHER} &&) + endif() + + if (NOT DEFINED BINCMP_OPS_INFO) + set(BINCMP_OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${BINCMP_COMPUTE_UNIT}-ops-info.ini) + endif() + if (NOT ${ENABLE_CROSS_COMPILE}) + add_custom_target(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} + COMMAND ${_ASCENDC_ENV_VAR} ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_compile_kernel.py + --op-name=${BINCMP_OP_TYPE} + --src-file=${BINCMP_SRC} + --compute-unit=${BINCMP_COMPUTE_UNIT} + --compile-options=\"${BINCMP_OPTIONS}\" + --debug-config=\"${BINCMP_CONFIGS}\" + --config-ini=${BINCMP_OPS_INFO} + --tiling-lib=${BINCMP_TILING_LIB} + --output-path=${BINCMP_OUT_DIR} + --dynamic-dir=${BINCMP_DYNAMIC_PATH} + --enable-binary=\"${ENABLE_BINARY_PACKAGE}\" + --json-file=${BINCMP_JSON_FILE} + --build-tool=$(MAKE)) + add_dependencies(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} cust_optiling) + else() + if (${ENABLE_BINARY_PACKAGE} AND NOT DEFINED HOST_NATIVE_TILING_LIB) + message(FATAL_ERROR "Native host libs was not set for cross compile!") + endif() + add_custom_target(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} + COMMAND ${_ASCENDC_ENV_VAR} ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_compile_kernel.py + --op-name=${BINCMP_OP_TYPE} + --src-file=${BINCMP_SRC} + --compute-unit=${BINCMP_COMPUTE_UNIT} + --compile-options=\"${BINCMP_OPTIONS}\" + --debug-config=\"${BINCMP_CONFIGS}\" + --config-ini=${BINCMP_OPS_INFO} + --tiling-lib=${HOST_NATIVE_TILING_LIB} + --output-path=${BINCMP_OUT_DIR} + --dynamic-dir=${BINCMP_DYNAMIC_PATH} + --enable-binary=\"${ENABLE_BINARY_PACKAGE}\" + --json-file=${BINCMP_JSON_FILE} + --build-tool=$(MAKE)) + endif() + add_dependencies(ascendc_bin_${BINCMP_COMPUTE_UNIT}_gen_ops_config ${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT}) + add_dependencies(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} ops_info_gen_${BINCMP_COMPUTE_UNIT}) +endfunction() + +function(ascendc_device_library) + message(STATUS "Ascendc device library generating") + cmake_parse_arguments(DEVICE "" "TARGET;OPTION" "SRC" ${ARGN}) + execute_process( + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink + COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink/CMakeLists.txt + ) + execute_process( + COMMAND ${CMAKE_COMMAND} -E echo "cmake_minimum_required(VERSION 3.16.0)\nproject(cust_tiling_sink)\ninclude(${CMAKE_SOURCE_DIR}/cmake/device_task.cmake)\n" + OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink/CMakeLists.txt + RESULT_VARIABLE result + ) + string(REPLACE ";" " " DEVICE_SRC "${DEVICE_SRC}") + ExternalProject_Add(tiling_sink_task + SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink + CONFIGURE_COMMAND ${CMAKE_COMMAND} + -DASCEND_CANN_PACKAGE_PATH=${ASCEND_CANN_PACKAGE_PATH} + -DTARGET=${DEVICE_TARGET} + -DOPTION=${DEVICE_OPTION} + -DSRC=${DEVICE_SRC} + -DVENDOR_NAME=${vendor_name} + + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} + INSTALL_COMMAND "" + BUILD_ALWAYS TRUE + ) + ExternalProject_Get_Property(tiling_sink_task BINARY_DIR) + set(TILINGSINK_LIB_PATH "") + if ("${DEVICE_OPTION}" STREQUAL "SHARED") + set(TILINGSINK_LIB_PATH "${BINARY_DIR}/libcust_opmaster.so") + else() + set(TILINGSINK_LIB_PATH "${BINARY_DIR}/libcust_opmaster.a") + endif() + install(FILES ${TILINGSINK_LIB_PATH} + DESTINATION 
packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_master_device/lib + ) +endfunction() +function(add_opregistry_target) + string(REPLACE ";" "-" COMPUTE_UNIT "${ASCEND_COMPUTE_UNIT}") + add_custom_target(op_registry_pack + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_pack_opregistry.py + --input-path=${CMAKE_SOURCE_DIR}/build_out/ + --base-path=${CMAKE_SOURCE_DIR}/build_out/tmp/vendors/ + --output-path=${CMAKE_SOURCE_DIR}/build_out/library/ + --vendor-name=${vendor_name} + --compute-unit=${COMPUTE_UNIT} + --framework-type=${ASCEND_FRAMEWORK_TYPE} + --platform=${CMAKE_SYSTEM_PROCESSOR}) + add_library(ascend_opregistry INTERFACE) + target_link_libraries(ascend_opregistry INTERFACE opregistry) + target_link_directories(ascend_opregistry INTERFACE ${CMAKE_SOURCE_DIR}/build_out/library) + target_include_directories(ascend_opregistry INTERFACE ${CMAKE_SOURCE_DIR}/build_out/library) + add_dependencies(ascend_opregistry op_registry_pack) + if(EXISTS "${CMAKE_SOURCE_DIR}/framework/caffe_plugin") + add_dependencies(op_registry_pack cust_caffe_parsers) + elseif(EXISTS "${CMAKE_SOURCE_DIR}/framework/tf_plugin") + add_dependencies(op_registry_pack cust_tf_parsers) + elseif(EXISTS "${CMAKE_SOURCE_DIR}/framework/onnx_plugin") + add_dependencies(op_registry_pack cust_onnx_parsers) + endif() +endfunction() + +function(add_kernels_install) + # install kernel file + if (${ENABLE_SOURCE_PACKAGE}) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/binary/dynamic/ + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic/ + ) + endif() + + # install *.o files and *.json files + if (${ENABLE_BINARY_PACKAGE}) + set(INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/) + foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}/ + DESTINATION ${INSTALL_DIR}/kernel/${compute_unit}/ + ) + endforeach() + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/binary/config/ + DESTINATION ${INSTALL_DIR}/kernel/config/ + ) + endif() +endfunction() + +function(add_kernels_compile) + set(DYNAMIC_PATH "") + if (${ENABLE_SOURCE_PACKAGE}) + set(DYNAMIC_PATH ${CMAKE_CURRENT_BINARY_DIR}/binary/dynamic) + execute_process(COMMAND sh -c "mkdir -p ${DYNAMIC_PATH} && + cp -rf ${CMAKE_SOURCE_DIR}/op_kernel/* ${DYNAMIC_PATH}/ && + rm ${DYNAMIC_PATH}/CMakeLists.txt" + RESULT_VARIABLE EXEC_RESULT + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message(FATAL_ERROR, "copy_source_files failed, gen error:${EXEC_ERROR}" ) + endif() + endif() + + foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + # generate aic-${compute_unit}-ops-info.json + add_ops_info_target(TARGET ops_info_gen_${compute_unit} + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit} + ) + + # define a target:binary to prevent kernel file from being rebuilt during the preinstall process + if (NOT TARGET binary) + add_custom_target(binary) + endif() + + if (${ENABLE_BINARY_PACKAGE} OR ${ENABLE_SOURCE_PACKAGE}) + if (${ENABLE_BINARY_PACKAGE}) + # gen binary_info_config.json and .json + add_custom_target(ascendc_bin_${compute_unit}_gen_ops_config + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/insert_simplified_keys.py + -p ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + COMMAND ${ASCEND_PYTHON_EXECUTABLE} 
+                        ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_ops_config.py
+                        -p ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}
+                        -s ${compute_unit}
+                COMMAND ${CMAKE_COMMAND} -E make_directory
+                        ${CMAKE_CURRENT_BINARY_DIR}/binary/config/${compute_unit}
+                COMMAND mv ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}/*.json
+                        ${CMAKE_CURRENT_BINARY_DIR}/binary/config/${compute_unit}
+            )
+        else()
+            if (NOT TARGET ascendc_bin_${compute_unit}_gen_ops_config)
+                add_custom_target(ascendc_bin_${compute_unit}_gen_ops_config)
+            endif()
+        endif()
+        add_dependencies(binary ascendc_bin_${compute_unit}_gen_ops_config)
+
+        # get op_type-op_name from aic-${compute_unit}-ops-info.ini
+        execute_process(COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_get_op_name.py
+                        --ini-file=${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini
+                        OUTPUT_VARIABLE OP_TYPE_NAME
+                        RESULT_VARIABLE EXEC_RESULT
+                        ERROR_VARIABLE EXEC_ERROR
+        )
+        if (${EXEC_RESULT})
+            message(FATAL_ERROR, "get op name failed, gen error: ${EXEC_ERROR}")
+        endif()
+
+        # compile op one by one with ascendc_compile_kernel.py
+        string(REPLACE "\n" ";" TYPE_NAME_LIST "${OP_TYPE_NAME}")
+        foreach(TYPE_NAME IN LISTS TYPE_NAME_LIST)
+            if (NOT "${TYPE_NAME}" STREQUAL "")
+                string(REPLACE "-" ";" bin_sep ${TYPE_NAME})
+                list(GET bin_sep 0 op_type)
+                list(GET bin_sep 1 op_file)
+                add_simple_kernel_compile(OP_TYPE ${op_type}
+                                          SRC ${CMAKE_SOURCE_DIR}/op_kernel/${op_file}.cpp
+                                          COMPUTE_UNIT ${compute_unit}
+                                          JSON_FILE ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json
+                                          DYNAMIC_PATH ${DYNAMIC_PATH})
+            endif()
+        endforeach()
+        endif()
+    endforeach()
+
+    # generate npu_supported_ops.json
+    add_npu_support_target(TARGET npu_supported_ops
+                           OPS_INFO_DIR ${ASCEND_AUTOGEN_PATH}
+                           OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core
+                           INSTALL_DIR packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE}
+    )
+
+    if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases)
+        add_subdirectory(testcases)
+    endif()
+
+    if(NOT ASCEND_PACK_SHARED_LIBRARY)
+        add_kernels_install()
+    else()
+        add_opregistry_target()
+    endif()
+endfunction()
\ No newline at end of file
diff --git a/atvc/examples/aclnn/add/AddCustom/op_host/CMakeLists.txt b/atvc/examples/aclnn/add/AddCustom/op_host/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7a6279ff469e98190072e341d2058941f197ccbf
--- /dev/null
+++ b/atvc/examples/aclnn/add/AddCustom/op_host/CMakeLists.txt
@@ -0,0 +1,183 @@
+if(DEFINED ENV{ATVC_PATH})
+    set(ATVC_PATH $ENV{ATVC_PATH})
+else()
+    set(ATVC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../include")
+endif()
+
+aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ops_srcs)
+opbuild(OPS_SRC ${ops_srcs}
+        OUT_DIR ${ASCEND_AUTOGEN_PATH}
+)
+
+file(GLOB group_proto_src ${ASCEND_AUTOGEN_PATH}/group_proto/*.cc)
+
+add_library(cust_op_proto SHARED
+    $<$<BOOL:${group_proto_src}>:${group_proto_src}>
+    ${ops_srcs}
+    ${ASCEND_AUTOGEN_PATH}/op_proto.cc
+)
+target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB)
+target_compile_options(cust_op_proto PRIVATE
+    -fvisibility=hidden
+)
+if(ENABLE_CROSS_COMPILE)
+    target_link_directories(cust_op_proto PRIVATE
+        ${CMAKE_COMPILE_COMPILER_LIBRARY}
+        ${CMAKE_COMPILE_RUNTIME_LIBRARY}
+    )
+endif()
+target_include_directories(cust_op_proto PRIVATE ${ATVC_PATH})
+target_link_libraries(cust_op_proto PRIVATE
+    intf_pub
+    exe_graph
+    register
+    tiling_api
+    -Wl,--whole-archive
+    rt2_registry
+    -Wl,--no-whole-archive
+)
+set_target_properties(cust_op_proto PROPERTIES
+    OUTPUT_NAME
+    cust_opsproto_rt2.0
+)
+file(GLOB fallback_src ${ASCEND_AUTOGEN_PATH}/fallback_*.cpp)
+add_library(cust_optiling SHARED ${ops_srcs})
+if (${fallback_src})
+    target_sources(cust_optiling PRIVATE ${fallback_src})
+endif()
+target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB)
+target_compile_options(cust_optiling PRIVATE
+    -fvisibility=hidden
+)
+target_include_directories(cust_optiling PRIVATE
+    ${ATVC_PATH}
+)
+if(ENABLE_CROSS_COMPILE)
+    target_link_directories(cust_optiling PRIVATE
+        ${CMAKE_COMPILE_COMPILER_LIBRARY}
+        ${CMAKE_COMPILE_RUNTIME_LIBRARY}
+    )
+endif()
+target_link_libraries(cust_optiling PRIVATE
+    nnopbase
+    intf_pub
+    exe_graph
+    register
+    tiling_api
+    -Wl,--whole-archive
+    rt2_registry
+    -Wl,--no-whole-archive
+)
+set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME
+    cust_opmaster_rt2.0
+)
+
+file(GLOB aclnn_src ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp)
+file(GLOB aclnn_inc ${ASCEND_AUTOGEN_PATH}/aclnn_*.h)
+
+if(NOT ASCEND_PACK_SHARED_LIBRARY)
+    add_library(cust_opapi SHARED ${aclnn_src})
+else()
+    file(GLOB op_registry ${ASCEND_AUTOGEN_PATH}/custom_op_registry.cpp)
+    add_library(cust_opapi SHARED ${aclnn_src} ${op_registry})
+    target_compile_definitions(cust_opapi PRIVATE ACLNN_WITH_BINARY)
+endif()
+
+target_include_directories(cust_opapi PRIVATE
+    ${ATVC_PATH}
+)
+if(ENABLE_CROSS_COMPILE)
+    target_link_directories(cust_opapi PRIVATE
+        ${CMAKE_COMPILE_COMPILER_LIBRARY}
+        ${CMAKE_COMPILE_RUNTIME_LIBRARY}
+    )
+endif()
+if(NOT ASCEND_PACK_SHARED_LIBRARY)
+    target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase)
+else()
+    add_library(cust_op_proto_obj OBJECT
+        $<$<BOOL:${group_proto_src}>:${group_proto_src}>
+        ${ops_srcs}
+        ${ASCEND_AUTOGEN_PATH}/op_proto.cc
+    )
+    target_compile_definitions(cust_op_proto_obj PRIVATE OP_PROTO_LIB)
+    target_compile_options(cust_op_proto_obj PRIVATE
+        -fvisibility=hidden
+    )
+    if(ENABLE_CROSS_COMPILE)
+        target_link_directories(cust_op_proto_obj PRIVATE
+            ${CMAKE_COMPILE_COMPILER_LIBRARY}
+            ${CMAKE_COMPILE_RUNTIME_LIBRARY}
+        )
+    endif()
+    target_link_libraries(cust_op_proto_obj PRIVATE
+        intf_pub
+        exe_graph
+        register
+        tiling_api
+        -Wl,--whole-archive
+        rt2_registry
+        -Wl,--no-whole-archive
+    )
+    add_library(cust_optiling_obj OBJECT ${ops_srcs})
+    target_compile_definitions(cust_optiling_obj PRIVATE OP_TILING_LIB)
+    target_compile_options(cust_optiling_obj PRIVATE
+        -fvisibility=hidden
+    )
+    target_include_directories(cust_optiling_obj PRIVATE ${ATVC_PATH})
+    if(ENABLE_CROSS_COMPILE)
+        target_link_directories(cust_optiling_obj PRIVATE
+            ${CMAKE_COMPILE_COMPILER_LIBRARY}
+            ${CMAKE_COMPILE_RUNTIME_LIBRARY}
+        )
+    endif()
+
+    target_link_libraries(cust_optiling_obj PRIVATE
+        intf_pub
+        exe_graph
+        register
+        tiling_api
+        -Wl,--whole-archive
+        rt2_registry
+        -Wl,--no-whole-archive
+    )
+    target_compile_options(cust_opapi PRIVATE -DLOG_CPP)
+    target_include_directories(cust_opapi INTERFACE ${ATVC_PATH})
+    target_include_directories(cust_opapi INTERFACE ${CMAKE_SOURCE_DIR}/build_out/binary/)
+    target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase cust_optiling_obj cust_op_proto_obj ascend_opregistry ascend_kernels)
+    add_dependencies(cust_opapi ascend_opregistry)
+endif()
+
+add_custom_target(optiling_compat ALL
+    COMMAND ln -sf lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$<TARGET_FILE_NAME:cust_optiling>
+            ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so
+)
+
+if(NOT ASCEND_PACK_SHARED_LIBRARY)
+    install(TARGETS cust_op_proto
+            LIBRARY DESTINATION packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR})
+    install(FILES
${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) + file(GLOB GROUP_PROTO_HEADERS ${ASCEND_AUTOGEN_PATH}/group_proto/*.h) + if (GROUP_PROTO_HEADERS) + install(FILES ${GROUP_PROTO_HEADERS} + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) + endif() + install(TARGETS cust_optiling + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) + install(TARGETS cust_opapi + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_api/lib) + install(FILES ${aclnn_inc} + DESTINATION packages/vendors/${vendor_name}/op_api/include) +else() + file(GLOB group_inc ${ASCEND_AUTOGEN_PATH}/group_proto/*.h) + install(TARGETS cust_opapi + LIBRARY DESTINATION op_api/lib) + install(FILES ${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION op_api/include) + install(FILES ${group_inc} + DESTINATION op_api/include) + install(FILES ${aclnn_inc} + DESTINATION op_api/include) +endif() diff --git a/atvc/examples/aclnn/add/AddCustom/op_host/add_custom.cpp b/atvc/examples/aclnn/add/AddCustom/op_host/add_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..da7c87ba15964e262435e3a7601d10819224e191 --- /dev/null +++ b/atvc/examples/aclnn/add/AddCustom/op_host/add_custom.cpp @@ -0,0 +1,81 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */
+
+#include "elewise/elewise_host.h"
+#include "register/op_def_registry.h"
+
+using AddOpTraitsFloat = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>>;
+using AddOpTraitsInt = ATVC::OpTraits<ATVC::OpInputs<int32_t, int32_t>, ATVC::OpOutputs<int32_t>>;
+
+namespace optiling {
+static ge::graphStatus TilingFunc(gert::TilingContext *context)
+{
+    // Declare the runtime tiling parameter
+    ATVC::EleWiseParam *tiling = context->GetTilingData<ATVC::EleWiseParam>();
+    uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize();
+    if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_FLOAT) {
+        // AddOpTraitsFloat is the Add operator description; the tiling data is computed from the operator's input/output count and the actual element count
+        (void)ATVC::Host::CalcEleWiseTiling<AddOpTraitsFloat>(totalLength, *tiling);
+    } else if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_INT32) {
+        (void)ATVC::Host::CalcEleWiseTiling<AddOpTraitsInt>(totalLength, *tiling);
+    }
+    // Set the tiling key
+    context->SetTilingKey(0);
+    // Set blockDim
+    context->SetBlockDim(tiling->tilingData.blockNum);
+    // Set the workspace size
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
+    currentWorkspace[0] = 0;
+    return ge::GRAPH_SUCCESS;
+}
+} // namespace optiling
+
+namespace ge {
+static graphStatus InferShape(gert::InferShapeContext *context)
+{
+    const gert::Shape *x1_shape = context->GetInputShape(0);
+    gert::Shape *y_shape = context->GetOutputShape(0);
+    *y_shape = *x1_shape;
+    return GRAPH_SUCCESS;
+}
+
+static graphStatus InferDataType(gert::InferDataTypeContext *context)
+{
+    const auto inputDataType = context->GetInputDataType(0);
+    context->SetOutputDataType(0, inputDataType);
+    return ge::GRAPH_SUCCESS;
+}
+} // namespace ge
+
+namespace ops {
+class AddCustom : public OpDef {
+public:
+    explicit AddCustom(const char *name) : OpDef(name)
+    {
+        this->Input("x")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Input("y")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Output("z")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND});
+
+        this->SetInferShape(ge::InferShape).SetInferDataType(ge::InferDataType);
+        this->AICore().SetTiling(optiling::TilingFunc).AddConfig("ascend910b");
+    }
+};
+OP_ADD(AddCustom);
+} // namespace ops
diff --git a/atvc/examples/aclnn/add/AddCustom/op_kernel/CMakeLists.txt b/atvc/examples/aclnn/add/AddCustom/op_kernel/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..16f4f8e4aa572b35d908cb42f2c4ffbdb91b2c5d
--- /dev/null
+++ b/atvc/examples/aclnn/add/AddCustom/op_kernel/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(CMAKE_VERSION_MAKEFILE ON)
+if(DEFINED ENV{ATVC_PATH})
+    set(ATVC_PATH $ENV{ATVC_PATH})
+else()
+    set(ATVC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../include")
+endif()
+if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx")
+    add_ops_compile_options(ALL OPTIONS -g -O0)
+endif()
+
+add_ops_compile_options(ALL OPTIONS -g -O0 --cce-aicore-block-local-init -I ${ATVC_PATH})
+add_kernels_compile()
\ No newline at end of file
diff --git a/atvc/examples/aclnn/add/AddCustom/op_kernel/add_custom.cpp b/atvc/examples/aclnn/add/AddCustom/op_kernel/add_custom.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d28d2c6c93b669480b2f0570b9301cd57a9105fa
--- /dev/null
+++ b/atvc/examples/aclnn/add/AddCustom/op_kernel/add_custom.cpp
@@ -0,0 +1,31 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include "elewise/elewise_device.h"
+
+using AddOpTraits = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>>;
+
+template <typename OpTraits>
+struct AddComputeFunc {
+    template <typename DataType>
+    __aicore__ inline void operator()(AscendC::LocalTensor<DataType> x, AscendC::LocalTensor<DataType> y, AscendC::LocalTensor<DataType> z)
+    {
+        AscendC::Add(z, x, y, z.GetSize());
+    }
+};
+
+extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling)
+{
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);
+    REGISTER_TILING_DEFAULT(ATVC::EleWiseParam);
+    auto op = ATVC::Kernel::EleWiseOpTemplate<AddComputeFunc<AddOpTraits>>();
+    op.Run(x, y, z, tiling);
+}
diff --git a/atvc/examples/aclnn/add/README.md b/atvc/examples/aclnn/add/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5eb359d02f3ee5c9e50a6c5f3a582b3f880af058
--- /dev/null
+++ b/atvc/examples/aclnn/add/README.md
@@ -0,0 +1,109 @@
+## 概述
+本样例基于AddCustom算子工程,介绍了基于ATVC的单算子工程开发与单算子调用。
+
+## 目录结构介绍
+```
+├── add
+│   ├── AclNNInvocationNaive    // 通过aclnn调用的方式调用AddCustom算子
+│   ├── AddCustom               // AddCustom算子工程
+│   ├── AddCustom.json          // AddCustom算子的原型定义json文件
+│   └── install.sh              // 脚本,调用msOpGen生成自定义算子工程,并编译
+```
+
+## 算子描述
+Add算子实现了两个数据相加,返回相加结果的功能。对应的数学表达式为:
+```
+z = x + y
+```
+## 算子规格描述
+| 算子类型(OpType) | Add | | | |
+| ---- | ---- | ---- | ---- | ---- |
+| 算子输入 | name | shape | data type | format |
+| | x | 8 * 2048 | float,int | ND |
+| | y | 8 * 2048 | float,int | ND |
+| 算子输出 | z | 8 * 2048 | float,int | ND |
+| 核函数名 | add_custom | | | |
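+
+算子部署完成后,可通过编译部署阶段自动生成的aclnn两段式接口调用AddCustom(完整流程见AclNNInvocationNaive)。下面给出一段调用示意代码(仅为示意:头文件名与接口名按aclnn<OpType>生成规则推测,实际以部署后的生成件为准):
+```cpp
+#include "acl/acl.h"
+#include "aclnn_add_custom.h" // 假设:自定义算子包部署后生成的头文件
+
+// 两段式接口调用示意:先查询workspace大小,再下发计算
+aclnnStatus CallAddCustom(aclTensor *x, aclTensor *y, aclTensor *z, aclrtStream stream)
+{
+    uint64_t workspaceSize = 0;
+    aclOpExecutor *executor = nullptr;
+    // 第一段接口:计算本次调用所需的workspace内存大小(接口名按生成规则推测)
+    aclnnStatus ret = aclnnAddCustomGetWorkspaceSize(x, y, z, &workspaceSize, &executor);
+    if (ret != ACL_SUCCESS) {
+        return ret;
+    }
+    void *workspace = nullptr;
+    if (workspaceSize > 0U) {
+        (void)aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+    }
+    // 第二段接口:执行算子计算任务
+    ret = aclnnAddCustom(workspace, workspaceSize, executor, stream);
+    (void)aclrtSynchronizeStream(stream);
+    if (workspace != nullptr) {
+        (void)aclrtFree(workspace);
+    }
+    return ret;
+}
+```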
+
+## 支持的产品型号
+本样例支持如下产品型号:
+- Atlas A2训练系列产品
+
+## 算子工程介绍
+算子工程目录AddCustom包含算子的实现文件,如下所示:
+```
+├── AddCustom               // AddCustom自定义算子工程
+│   ├── op_host             // host侧实现文件
+│   ├── op_kernel           // kernel侧实现文件
+│   ├── build.sh            // 算子构建入口
+│   └── CMakeLists.txt      // 算子的cmake文件
+```
+CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通过AddCustom.json自动创建,自定义算子工程具体请参考[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)>工程化算子开发>创建算子工程 章节。
+
+创建完自定义算子工程后,开发者重点需要完成算子host和kernel文件的功能开发。为简化样例运行流程,本样例已在AddCustom目录中准备好了必要的算子实现,install.sh脚本会创建一个CustomOp目录,并将算子实现文件复制到对应目录下,再编译算子。
+
+备注:CustomOp目录为生成目录,每次执行install.sh脚本都会删除该目录并重新生成,切勿在该目录下编写算子代码,否则会存在丢失风险。
+
+## 编译运行样例算子
+针对自定义算子工程,编译运行包含如下步骤:
+- 调用msOpGen工具生成自定义算子工程;
+- 基于ATVC框架完成算子host和kernel实现;
+- 编译自定义算子工程生成自定义算子包;
+- 安装自定义算子包到自定义算子库中;
+- 调用执行自定义算子。
+
+详细操作如下所示。
+### 1. 获取源码包及环境配置
+编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)完成源码包获取及环境变量准备。
+
+### 2. 生成自定义算子工程,复制host和kernel实现并编译算子
+ - 切换到msOpGen脚本install.sh所在目录
+    ```bash
+    # 若开发者以git命令行方式clone了master分支代码,并切换目录
+    cd atvc/examples/aclnn/add
+    ```
+
+ - 调用脚本,生成自定义算子工程,复制host和kernel实现并编译算子
+
+   运行install.sh脚本
+    ```bash
+    # 以910B1为例,运行命令如下:
+    bash install.sh -v [SOC_VERSION]
+    ```
+    参数说明:
+    - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号:
+    - Atlas A2训练系列产品
+
+   脚本运行成功后,会在当前目录下创建CustomOp目录;编译完成后,会在CustomOp/build_out中生成自定义算子安装包custom_opp_<target os>_<target architecture>.run,例如“custom_opp_ubuntu_x86_64.run”。
+
+### 3. 部署自定义算子包
+- 部署自定义算子包前,请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH
+    ```bash
+    echo $ASCEND_OPP_PATH
+    # 输出示例 /usr/local/Ascend/ascend-toolkit/latest/opp
+
+    # 若没有,则需导出CANN环境变量
+    source [ASCEND_INSTALL_PATH]/bin/setenv.bash
+    # 例如 source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash
+    ```
+    参数说明:
+    - ASCEND_INSTALL_PATH:CANN软件包安装路径,一般和上一步中指定的路径保持一致
+
+- 在自定义算子安装包所在路径下,执行如下命令安装自定义算子包
+    ```bash
+    cd CustomOp/build_out
+    ./custom_opp_<target os>_<target architecture>.run
+    ```
+    命令执行成功后,自定义算子包中的相关文件将部署至opp算子库环境变量ASCEND_OPP_PATH指向的vendors/customize目录中。
+
+### 4. 调用执行算子工程
+- [aclnn调用AddCustom算子工程](./AclNNInvocationNaive/README.md)
+
+## 更新说明
+| 时间 | 更新事项 |
+| ---------- | ---------------------------- |
+| 2025/07/24 | 新增readme |
diff --git a/atvc/examples/aclnn/add/install.sh b/atvc/examples/aclnn/add/install.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5ca57e7ebfea0ed405ccffa131a6c42ea94a5fe2
--- /dev/null
+++ b/atvc/examples/aclnn/add/install.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+SHORT=v:,i:,
+LONG=soc-version:,install-path:,
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS"
+
+while :; do
+    case "$1" in
+    -v | --soc-version)
+        SOC_VERSION="$2"
+        shift 2
+        ;;
+    -i | --install-path)
+        ASCEND_INSTALL_PATH="$2"
+        shift 2
+        ;;
+    --)
+        shift
+        break
+        ;;
+    *)
+        echo "[ERROR] Unexpected option: $1"
+        break
+        ;;
+    esac
+done
+
+VERSION_LIST="Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4"
+if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then
+    echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]"
+    exit -1
+fi
+
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+source $_ASCEND_INSTALL_PATH/bin/setenv.bash
+export ASCEND_HOME_PATH=$_ASCEND_INSTALL_PATH
+
+OP_NAME=AddCustom
+rm -rf CustomOp
+# Generate the op framework
+msopgen gen -i $OP_NAME.json -c ai_core-${SOC_VERSION} -lan cpp -out CustomOp
+# Copy op implementation files to CustomOp
+cp -rf $OP_NAME/* CustomOp
+# Delete tiling.h
+rm -rf CustomOp/op_host/*_tiling.h
+# Build CustomOp project
+(cd CustomOp && bash build.sh)
\ No newline at end of file
diff --git a/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/CMakeLists.txt b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9a92ffe7d9af319c4ebfb69925293080cc9ac2a1
--- /dev/null
+++ b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/CMakeLists.txt
@@ -0,0 +1,60 @@
+# Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+
+# CMake lowest version requirement
+cmake_minimum_required(VERSION 3.5.1)
+
+# project information
+project(acl_execute_reduce_sum)
+
+# Compile options
+add_compile_options(-std=c++11)
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "./")
+
+set(INC_PATH $ENV{DDK_PATH})
+
+if (NOT DEFINED ENV{DDK_PATH})
+    set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest")
+    message(STATUS "set default INC_PATH: ${INC_PATH}")
+else ()
+    message(STATUS "env INC_PATH: ${INC_PATH}")
+endif()
+
+set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/customize/op_api")
+
+set(LIB_PATH $ENV{NPU_HOST_LIB})
+
+# Dynamic libraries in the stub directory can only be used for compilation
+if (NOT DEFINED ENV{NPU_HOST_LIB})
+    string(TOLOWER "${CMAKE_SYSTEM_NAME}" SYSTEM_NAME_LOWER)
+    set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/${CMAKE_SYSTEM_PROCESSOR}-${SYSTEM_NAME_LOWER}/devlib")
+    message(STATUS "set default LIB_PATH: ${LIB_PATH}")
+else ()
+    message(STATUS "env LIB_PATH: ${LIB_PATH}")
+endif()
+
+# Header path
+include_directories(
+    ${INC_PATH}/include
+    ${CUST_PKG_PATH}/include
+)
+
+# add host lib path
+link_directories(
+    ${LIB_PATH}
+    ${CUST_PKG_PATH}/lib
+)
+
+add_executable(execute_reduce_sum_op
+    main.cpp
+)
+
+target_link_libraries(execute_reduce_sum_op
+    ascendcl
+    cust_opapi
+    acl_op_compiler
+    nnopbase
+    stdc++
+)
+
+install(TARGETS execute_reduce_sum_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
diff --git a/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/README.md b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4bc6770ff8e0658d91afb7106907e6c2633c8b31
--- /dev/null
+++ b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/README.md
@@ -0,0 +1,52 @@
+## 概述
+本样例相比于AclNNInvocation样例工程,简化了工程配置。
+## 目录结构介绍
+```
+├── AclNNInvocationNaive
+│   ├── CMakeLists.txt      // 编译规则文件
+│   ├── main.cpp            // 单算子调用应用的入口
+│   └── run.sh              // 编译运行算子的脚本
+```
+## 代码实现介绍
+完成自定义算子的开发部署后,可以通过单算子调用的方式来验证单算子的功能。main.cpp代码为单算子API执行方式。单算子API执行基于C语言的API执行算子,无需提供单算子描述文件进行离线模型的转换,可直接调用单算子API接口。
+
+自定义算子编译部署后,会自动生成单算子API,可以直接在应用程序中调用。算子API的形式一般定义为“两段式接口”,形如:
+    ```cpp
+    // 获取算子使用的workspace空间大小
+    aclnnStatus aclnnReduceSumCustomGetWorkspaceSize(const aclTensor *x, const aclIntArray *dim, const aclTensor *out, uint64_t *workspaceSize, aclOpExecutor **executor);
+    // 执行算子
+    aclnnStatus aclnnReduceSumCustom(void *workspace, uint64_t workspaceSize, aclOpExecutor *executor, aclrtStream stream);
+    ```
+其中`aclnnReduceSumCustomGetWorkspaceSize`为第一段接口,用于计算本次API调用所需的`workspace`内存大小。获取到本次API计算需要的`workspace`大小之后,开发者按照`workspaceSize`大小申请Device侧内存,然后调用第二段接口`aclnnReduceSumCustom`执行计算。具体参考[单算子API调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)章节。
+## 运行样例算子
+### 1. 编译算子工程
+运行此样例前,请参考[编译算子工程](../README.md#operatorcompile)完成前期准备。
+### 2. aclnn调用样例运行
+
+ - 进入到样例目录
+   以git命令行方式下载样例代码(以master分支为例):
+    ```bash
+    cd atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive
+    ```
+ - 样例编译文件修改
+
+   将CMakeLists.txt文件内"/usr/local/Ascend/ascend-toolkit/latest"替换为CANN软件包安装后的实际路径,
+   例如:/home/HwHiAiUser/Ascend/ascend-toolkit/latest
+
+ - 环境变量配置
+
+   需要设置NPU_HOST_LIB环境变量,以x86为例
+    ```bash
+    export NPU_HOST_LIB=/home/HwHiAiUser/Ascend/ascend-toolkit/latest/x86_64-linux/lib64
+    ```
+ - 样例执行
+
+   用户参考run.sh脚本进行编译与运行。
+    ```bash
+    bash run.sh
+    ```
+
+## 更新说明
+| 时间 | 更新事项 |
+| ---------- | ------------ |
+| 2025/07/22 | 新增本readme |
\ No newline at end of file
diff --git a/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/main.cpp b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f748e7fab273fa66581989d959e7b8f725d73494
--- /dev/null
+++ b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/main.cpp
@@ -0,0 +1,195 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+
+#include "acl/acl.h"
+#include "aclnn_reduce_sum_custom.h"
+
+namespace {
+#define CHECK_RET(cond, return_expr) \
+    do {                             \
+        if (!(cond)) {               \
+            return_expr;             \
+        }                            \
+    } while (0)
+
+#define LOG_PRINT(message, ...)         \
+    do {                                \
+        printf(message, ##__VA_ARGS__); \
+    } while (0)
+
+int64_t GetShapeSize(const std::vector<int64_t> &shape)
+{
+    int64_t shapeSize = 1;
+    for (auto i : shape) {
+        shapeSize *= i;
+    }
+    return shapeSize;
+}
+
+int Init(int32_t deviceId, aclrtStream *stream)
+{
+    // Fixed code, acl initialization
+    auto ret = aclInit(nullptr);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return 1);
+    ret = aclrtSetDevice(deviceId);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return 1);
+    ret = aclrtCreateStream(stream);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return 1);
+
+    return 0;
+}
+
+template <typename T>
+int CreateAclTensor(const std::vector<T> &hostData, const std::vector<int64_t> &shape, void **deviceAddr,
+                    aclDataType dataType, aclTensor **tensor)
+{
+    auto size = GetShapeSize(shape) * sizeof(T);
+    // Call aclrtMalloc to allocate device memory
+    auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return 1);
+
+    // Call aclrtMemcpy to copy host data to device memory
+    ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return 1);
+
+    // Call aclCreateTensor to create a aclTensor object
+    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, nullptr, 0, aclFormat::ACL_FORMAT_ND, shape.data(),
+                              shape.size(), *deviceAddr);
+    return 0;
+}
+
+void DestroyResources(std::vector<aclTensor *> tensors, std::vector<void *> deviceAddrs, aclrtStream stream,
+                      int32_t deviceId, void *workspaceAddr = nullptr)
+{
+    // Release aclTensor and device memory
+    for (uint32_t i = 0; i < tensors.size(); i++) {
+        if (tensors[i] != nullptr) {
+            aclDestroyTensor(reinterpret_cast<aclTensor *>(tensors[i]));
+        }
+        if (deviceAddrs[i] != nullptr) {
+            aclrtFree(deviceAddrs[i]);
+        }
+    }
+    if (workspaceAddr != nullptr) {
+        aclrtFree(workspaceAddr);
+    }
+    // Destroy stream and reset device
+    aclrtDestroyStream(stream);
+    aclrtResetDevice(deviceId);
+    aclFinalize();
+}
+
+void InitializeData(std::vector<float> &inputX, std::vector<float> &outputY, std::vector<float> &golden,
+                    std::vector<int64_t> &inputXShape, std::vector<int64_t> &outputYShape)
+{
+    for (int i = 0; i < inputXShape[0] * inputXShape[1]; ++i) {
+        inputX[i] = 1.0;
+    }
+    float dealResult = 8.0;
+    for (int i = 0; i < outputYShape[0] * outputYShape[1]; ++i) {
+        golden[i] = dealResult;
+        outputY[i] = 0.0;
+    }
+}
+
+bool VerifyResults(const std::vector<float> &goldenData, const std::vector<float> &resultData)
+{
+    int64_t len = 10;
+    LOG_PRINT("result is:\n");
+    for (int64_t i = 0; i < len; i++) {
+        LOG_PRINT("%.1f ", resultData[i]);
+    }
+    LOG_PRINT("\n");
+    if (std::equal(resultData.begin(), resultData.end(), goldenData.begin())) {
+        LOG_PRINT("test pass\n");
+    } else {
+        LOG_PRINT("test failed\n");
+        return false;
+    }
+    return true;
+}
+}
+
+int main(int argc, char **argv)
+{
+    // 1. (Fixed code) Initialize device / stream, refer to the list of external interfaces of acl
+    // Update deviceId to your own device id
+    int32_t deviceId = 0;
+    aclrtStream stream;
+    auto ret = Init(deviceId, &stream);
+    CHECK_RET(ret == 0, LOG_PRINT("Init acl failed. ERROR: %d\n", ret); return 1);
+
+    // 2. Create input and output, need to customize according to the interface of the API
+    std::vector<int64_t> inputXShape = {8, 2048};
+    std::vector<int64_t> outputYShape = {1, 2048};
+    void *inputXDeviceAddr = nullptr;
+    void *outputYDeviceAddr = nullptr;
+    aclTensor *inputX = nullptr;
+    aclTensor *outputY = nullptr;
+    std::vector<float> inputXHostData(inputXShape[0] * inputXShape[1]);
+    std::vector<float> outputYHostData(outputYShape[0] * outputYShape[1]);
+    std::vector<float> goldenData(outputYShape[0] * outputYShape[1]);
+
+    InitializeData(inputXHostData, outputYHostData, goldenData, inputXShape, outputYShape);
+    std::vector<aclTensor *> tensors = {inputX, outputY};
+    std::vector<void *> deviceAddrs = {inputXDeviceAddr, outputYDeviceAddr};
+    // Create inputX aclTensor
+    ret = CreateAclTensor(inputXHostData, inputXShape, &inputXDeviceAddr, aclDataType::ACL_FLOAT, &inputX);
+    CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return 1);
+    // Create outputY aclTensor
+    ret = CreateAclTensor(outputYHostData, outputYShape, &outputYDeviceAddr, aclDataType::ACL_FLOAT, &outputY);
+    CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return 1);
+    // Create dimOut aclIntArray
+    std::vector<int64_t> dim{0};
+    aclIntArray* dimOut = aclCreateIntArray(dim.data(), dim.size());
+    // 3. Call the API of the custom operator library
+    uint64_t workspaceSize = 0;
+    aclOpExecutor *executor;
+    // Calculate the workspace size and allocate memory for it
+    ret = aclnnReduceSumCustomGetWorkspaceSize(inputX, dimOut, outputY, &workspaceSize, &executor);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnReduceSumCustomGetWorkspaceSize failed. ERROR: %d\n", ret);
+              DestroyResources(tensors, deviceAddrs, stream, deviceId); return 1);
+    void *workspaceAddr = nullptr;
+    if (workspaceSize > 0U) {
+        ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+        CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+    }
+    // Execute the custom operator
+    ret = aclnnReduceSumCustom(workspaceAddr, workspaceSize, executor, stream);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnReduceSumCustom failed. ERROR: %d\n", ret);
+              DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+
+    // 4. (Fixed code) Synchronize and wait for the task to complete
+    ret = aclrtSynchronizeStream(stream);
+    CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+
+    // 5. Get the output value, copy the result from device memory to host memory, need to modify according to the
+    //    interface of the API
+    auto size = GetShapeSize(outputYShape);
+    std::vector<float> resultData(size, 0);
+    ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]), outputYDeviceAddr,
+                      size * sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. ERROR: %d\n", ret);
+              DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return 1);
+
+    // 6. Destroy resources, need to modify according to the interface of the API
+    DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr);
+
+    // print the output result
+    if (!VerifyResults(goldenData, resultData)) {
+        return -1;
+    }
+    return 0;
+}
diff --git a/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/run.sh b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9bb88021ddeeb0aaca15af2eb5f463cbe5a3a8fe
--- /dev/null
+++ b/atvc/examples/aclnn/reduce_sum/AclNNInvocationNaive/run.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+source $_ASCEND_INSTALL_PATH/bin/setenv.bash
+export DDK_PATH=$_ASCEND_INSTALL_PATH
+export NPU_HOST_LIB=$_ASCEND_INSTALL_PATH/$(arch)-$(uname -s | tr '[:upper:]' '[:lower:]')/devlib
+
+set -e
+rm -rf build
+mkdir -p build
+cmake -B build -DCMAKE_SKIP_RPATH=TRUE
+cmake --build build -j
+(
+    cd build
+    export LD_LIBRARY_PATH=$_ASCEND_INSTALL_PATH/opp/vendors/customize/op_api/lib:$LD_LIBRARY_PATH
+    ./execute_reduce_sum_op
+)
diff --git a/atvc/examples/aclnn/reduce_sum/README.md b/atvc/examples/aclnn/reduce_sum/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..693753811240e87033d47a734414852b0bad8299
--- /dev/null
+++ b/atvc/examples/aclnn/reduce_sum/README.md
@@ -0,0 +1,105 @@
+## 概述
+本样例基于ReduceSumCustom算子工程,介绍了基于ATVC的单算子工程开发与单算子调用。
+
+## 目录结构介绍
+```
+├── reduce_sum                      // 使用框架调用的方式调用ReduceSum算子
+│   ├── AclNNInvocationNaive        // 通过aclnn调用的方式调用ReduceSumCustom算子
+│   ├── ReduceSumCustom             // ReduceSumCustom算子工程
+│   ├── ReduceSumCustom.json        // ReduceSumCustom算子的原型定义json文件
+│   └── install.sh                  // 脚本,调用msOpGen生成自定义算子工程,并编译
+```
+
+## 算子描述
+ReduceSum是对输入tensor的指定轴进行规约累加并输出结果的Reduce类算子。
+
+## 算子规格描述
+| 算子类型(OpType) | ReduceSum | | | |
+| ---- | ---- | ---- | ---- | ---- |
+| 算子输入 | name | shape | data type | format |
+| | x | 8 * 2048 | float,int | ND |
+| 算子输出 | y | 1 * 2048 | float,int | ND |
+| 核函数名 | reduce_sum_custom | | | |
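+
+为便于理解上表中8 * 2048在dim={0}上规约为1 * 2048的计算语义,下面给出一段host侧参考实现示意(仅用于说明ReduceSum的数学语义,并非ATVC的模板实现):
+```cpp
+#include <cstdint>
+#include <vector>
+
+// ReduceSum在dim={0}上的golden参考:对第0轴逐列累加,rows * cols -> 1 * cols
+std::vector<float> ReduceSumDim0(const std::vector<float> &x, int64_t rows, int64_t cols)
+{
+    std::vector<float> y(cols, 0.0f);
+    for (int64_t r = 0; r < rows; ++r) {
+        for (int64_t c = 0; c < cols; ++c) {
+            y[c] += x[r * cols + c];
+        }
+    }
+    return y;
+}
+```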
+
+## 支持的产品型号
+本样例支持如下产品型号:
+- Atlas A2训练系列产品
+
+## 算子工程介绍
+算子工程目录ReduceSumCustom包含算子的实现文件,如下所示:
+```
+├── ReduceSumCustom         // ReduceSumCustom自定义算子工程
+│   ├── op_host             // host侧实现文件
+│   ├── op_kernel           // kernel侧实现文件
+│   ├── build.sh            // 算子构建入口
+│   └── CMakeLists.txt      // 算子的cmake文件
+```
+CANN软件包中提供了工程创建工具msOpGen,ReduceSumCustom算子工程可通过ReduceSumCustom.json自动创建,自定义算子工程具体请参考[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)>工程化算子开发>创建算子工程 章节。
+
+创建完自定义算子工程后,开发者重点需要完成算子host和kernel文件的功能开发。为简化样例运行流程,本样例已在ReduceSumCustom目录中准备好了必要的算子实现,install.sh脚本会创建一个CustomOp目录,并将算子实现文件复制到对应目录下,再编译算子。
+
+备注:CustomOp目录为生成目录,每次执行install.sh脚本都会删除该目录并重新生成,切勿在该目录下编写算子代码,否则会存在丢失风险。
+
+## 编译运行样例算子
+针对自定义算子工程,编译运行包含如下步骤:
+- 调用msOpGen工具生成自定义算子工程;
+- 基于ATVC框架完成算子host和kernel实现;
+- 编译自定义算子工程生成自定义算子包;
+- 安装自定义算子包到自定义算子库中;
+- 调用执行自定义算子。
+
+详细操作如下所示。
+### 1. 获取源码包及环境配置
+编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)完成源码包获取及环境变量准备。
+
+### 2. 生成自定义算子工程,复制host和kernel实现并编译算子
+ - 切换到msOpGen脚本install.sh所在目录
+    ```bash
+    # 若开发者以git命令行方式clone了master分支代码,并切换目录
+    cd ./atvc/examples/aclnn/reduce_sum
+    ```
+
+ - 调用脚本,生成自定义算子工程,复制host和kernel实现并编译算子
+
+   运行install.sh脚本
+    ```bash
+    bash install.sh -v [SOC_VERSION]
+    ```
+    参数说明:
+    - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号:
+    - Atlas A2训练系列产品
+
+   脚本运行成功后,会在当前目录下创建CustomOp目录;编译完成后,会在CustomOp/build_out中生成自定义算子安装包custom_opp_<target os>_<target architecture>.run,例如“custom_opp_ubuntu_x86_64.run”。
+
+### 3. 部署自定义算子包
+- 部署自定义算子包前,请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH
+    ```bash
+    echo $ASCEND_OPP_PATH
+    # 输出示例 /usr/local/Ascend/ascend-toolkit/latest/opp
+
+    # 若没有,则需导出CANN环境变量
+    source [ASCEND_INSTALL_PATH]/bin/setenv.bash
+    # 例如 source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash
+    ```
+    参数说明:
+    - ASCEND_INSTALL_PATH:CANN软件包安装路径,一般和上一步中指定的路径保持一致
+
+- 在自定义算子安装包所在路径下,执行如下命令安装自定义算子包
+    ```bash
+    cd CustomOp/build_out
+    ./custom_opp_<target os>_<target architecture>.run
+    ```
+    命令执行成功后,自定义算子包中的相关文件将部署至opp算子库环境变量ASCEND_OPP_PATH指向的vendors/customize目录中。
+
+### 4.
调用执行算子工程 +- [aclnn调用ReduceSumCustom算子工程](./AclNNInvocationNaive/README.md) + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ---------------------------- | +| 2025/07/24 | 新增readme | diff --git a/atvc/examples/aclnn/reduce_sum/ReduceSumCustom.json b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom.json new file mode 100644 index 0000000000000000000000000000000000000000..c214368089dea1b9c4d98f18576d7b7e7bd0fe31 --- /dev/null +++ b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom.json @@ -0,0 +1,41 @@ +[ + { + "op": "ReduceSumCustom", + "language": "cpp", + "input_desc": [ + { + "name": "x", + "param_type": "required", + "format": [ + "ND", + "ND" + ], + "type": [ + "float32", + "int32" + ] + } + ], + "output_desc": [ + { + "name": "y", + "param_type": "required", + "format": [ + "ND", + "ND" + ], + "type": [ + "float32", + "int32" + ] + } + ], + "attr": [ + { + "name": "dim", + "param_type": "required", + "type": "list_int" + } + ] + } +] \ No newline at end of file diff --git a/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/CMakeLists.txt b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7e1f43e5a2ed42c2ee3395308378e61eb15606f --- /dev/null +++ b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/CMakeLists.txt @@ -0,0 +1,76 @@ +cmake_minimum_required(VERSION 3.16.0) +project(opp) + +include(cmake/config.cmake) +include(cmake/func.cmake) +include(cmake/intf.cmake) + +if(ENABLE_CROSS_COMPILE) + if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL x86_64) + set(CROSS_COMPILE_PLATFORM aarch64) + else() + set(CROSS_COMPILE_PLATFORM x86_64) + endif() + set(PLATFORM ${CMAKE_SYSTEM_PROCESSOR}) + set(CMAKE_COMPILE_COMPILER_LIBRARY ${ASCEND_CANN_PACKAGE_PATH}/${PLATFORM}-linux/devlib/linux/${CROSS_COMPILE_PLATFORM}/) + set(CMAKE_COMPILE_RUNTIME_LIBRARY ${ASCEND_CANN_PACKAGE_PATH}/${PLATFORM}-linux/devlib/${CROSS_COMPILE_PLATFORM}/) + if(CMAKE_CROSS_LIBRARY_PATH) + set(CMAKE_COMPILE_COMPILER_LIBRARY ${CMAKE_CROSS_LIBRARY_PATH}) + set(CMAKE_COMPILE_RUNTIME_LIBRARY ${CMAKE_CROSS_LIBRARY_PATH}) + endif() + set(CMAKE_SYSTEM_PROCESSOR ${CROSS_COMPILE_PLATFORM}) + set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER}) + set(CMAKE_CXX_COMPILER ${CMAKE_CROSS_PLATFORM_COMPILER}) +else() + set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER}) +endif() + +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework) + add_subdirectory(framework) +endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) + add_subdirectory(op_host) +endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) + add_subdirectory(op_kernel) +endif() +if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) +endif() + +# modify vendor_name in install.sh and upgrade.sh +add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh + COMMAND mkdir -p ${CMAKE_BINARY_DIR}/scripts + COMMAND cp -r ${CMAKE_SOURCE_DIR}/scripts/* ${CMAKE_BINARY_DIR}/scripts/ + COMMAND sed -i "s/vendor_name=customize/vendor_name=${vendor_name}/g" ${CMAKE_BINARY_DIR}/scripts/* +) +add_custom_target(modify_vendor ALL DEPENDS ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh) + +get_system_info(SYSTEM_INFO) + +# gen version.info +add_custom_target(gen_version_info ALL + COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/util/gen_version_info.sh ${ASCEND_CANN_PACKAGE_PATH} ${CMAKE_CURRENT_BINARY_DIR} +) + +if(NOT ASCEND_PACK_SHARED_LIBRARY) + install(DIRECTORY ${CMAKE_BINARY_DIR}/scripts/ DESTINATION . 
FILE_PERMISSIONS OWNER_EXECUTE OWNER_READ GROUP_READ) + + install(FILES ${CMAKE_SOURCE_DIR}/custom.proto DESTINATION packages OPTIONAL) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/version.info + DESTINATION packages/vendors/${vendor_name}/) + + # CPack config + set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME}) + set(CPACK_PACKAGE_VERSION ${CMAKE_PROJECT_VERSION}) + set(CPACK_PACKAGE_DESCRIPTION "CPack opp project") + set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CPack opp project") + set(CPACK_PACKAGE_DIRECTORY ${CMAKE_INSTALL_PREFIX}) + set(CPACK_PACKAGE_FILE_NAME "custom_opp_${SYSTEM_INFO}.run") + set(CPACK_GENERATOR External) + set(CPACK_CMAKE_GENERATOR "Unix Makefiles") + set(CPACK_EXTERNAL_ENABLE_STAGING TRUE) + set(CPACK_EXTERNAL_PACKAGE_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/makeself.cmake) + set(CPACK_EXTERNAL_BUILT_PACKAGES ${CPACK_PACKAGE_DIRECTORY}/_CPack_Packages/Linux/External/${CPACK_PACKAGE_FILE_NAME}/${CPACK_PACKAGE_FILE_NAME}) + include(CPack) +endif() diff --git a/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/cmake/func.cmake b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/cmake/func.cmake new file mode 100644 index 0000000000000000000000000000000000000000..e234f442aedc54b40f2d557d5ead27d7e8d880f5 --- /dev/null +++ b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/cmake/func.cmake @@ -0,0 +1,377 @@ +include(ExternalProject) +if(DEFINED ENV{ATVC_PATH}) + set(ATVC_PATH $ENV{ATVC_PATH}) +else() + set(ATVC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../include") +endif() +function(get_system_info SYSTEM_INFO) + if (UNIX) + execute_process(COMMAND grep -i ^id= /etc/os-release OUTPUT_VARIABLE TEMP) + string(REGEX REPLACE "\n|id=|ID=|\"" "" SYSTEM_NAME ${TEMP}) + set(${SYSTEM_INFO} ${SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR} PARENT_SCOPE) + elseif (WIN32) + message(STATUS "System is Windows. 
Only for pre-build.") + else () + message(FATAL_ERROR "${CMAKE_SYSTEM_NAME} not support.") + endif () +endfunction() + +function(opbuild) + message(STATUS "Opbuild generating sources") + cmake_parse_arguments(OPBUILD "" "OUT_DIR;PROJECT_NAME;ACCESS_PREFIX;ENABLE_SOURCE" "OPS_SRC" ${ARGN}) + execute_process(COMMAND ${CMAKE_COMPILE} -g -fPIC -shared -std=c++17 ${OPBUILD_OPS_SRC} -D_GLIBCXX_USE_CXX11_ABI=0 + -I ${ASCEND_CANN_PACKAGE_PATH}/include -I ${CMAKE_CURRENT_SOURCE_DIR}/../op_kernel + -L ${ASCEND_CANN_PACKAGE_PATH}/lib64 -lexe_graph -lregister -ltiling_api -I ${ATVC_PATH} + -o ${OPBUILD_OUT_DIR}/libascend_all_ops.so + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message("build ops lib info: ${EXEC_INFO}") + message("build ops lib error: ${EXEC_ERROR}") + message(FATAL_ERROR "opbuild run failed!") + endif() + set(proj_env "") + set(prefix_env "") + if (NOT "${OPBUILD_PROJECT_NAME}x" STREQUAL "x") + set(proj_env "OPS_PROJECT_NAME=${OPBUILD_PROJECT_NAME}") + endif() + if (NOT "${OPBUILD_ACCESS_PREFIX}x" STREQUAL "x") + set(prefix_env "OPS_DIRECT_ACCESS_PREFIX=${OPBUILD_ACCESS_PREFIX}") + endif() + + set(ENV{ENABLE_SOURCE_PACAKGE} ${OPBUILD_ENABLE_SOURCE}) + if(${ASCEND_PACK_SHARED_LIBRARY}) + if (NOT vendor_name) + message(FATAL_ERROR "ERROR: vendor_name is invalid!") + return() + endif() + set(ENV{ASCEND_VENDOR_NAME} ${vendor_name}) + set(ENV{OPS_PRODUCT_NAME} ${ASCEND_COMPUTE_UNIT}) + set(ENV{SYSTEM_PROCESSOR} ${CMAKE_SYSTEM_PROCESSOR}) + endif() + execute_process(COMMAND ${proj_env} ${prefix_env} ${ASCEND_CANN_PACKAGE_PATH}/toolkit/tools/opbuild/op_build + ${OPBUILD_OUT_DIR}/libascend_all_ops.so ${OPBUILD_OUT_DIR} + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR + ) + unset(ENV{ENABLE_SOURCE_PACAKGE}) + if(${ASCEND_PACK_SHARED_LIBRARY}) + unset(ENV{ASCEND_VENDOR_NAME}) + unset(ENV{OPS_PRODUCT_NAME}) + unset(ENV{SYSTEM_PROCESSOR}) + endif() + if (${EXEC_RESULT}) + message("opbuild ops info: ${EXEC_INFO}") + message("opbuild ops error: ${EXEC_ERROR}") + endif() + message(STATUS "Opbuild generating sources - done") +endfunction() + +function(add_ops_info_target) + cmake_parse_arguments(OPINFO "" "TARGET;OPS_INFO;OUTPUT;INSTALL_DIR" "" ${ARGN}) + get_filename_component(opinfo_file_path "${OPINFO_OUTPUT}" DIRECTORY) + add_custom_command(OUTPUT ${OPINFO_OUTPUT} + COMMAND mkdir -p ${opinfo_file_path} + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/parse_ini_to_json.py + ${OPINFO_OPS_INFO} ${OPINFO_OUTPUT} + ) + add_custom_target(${OPINFO_TARGET} ALL + DEPENDS ${OPINFO_OUTPUT} + ) + if(NOT ${ASCEND_PACK_SHARED_LIBRARY}) + install(FILES ${OPINFO_OUTPUT} + DESTINATION ${OPINFO_INSTALL_DIR} + ) + endif() +endfunction() + +function(add_ops_compile_options OP_TYPE) + cmake_parse_arguments(OP_COMPILE "" "OP_TYPE" "COMPUTE_UNIT;OPTIONS" ${ARGN}) + execute_process(COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_gen_options.py + ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS} ${OP_TYPE} ${OP_COMPILE_COMPUTE_UNIT} + ${OP_COMPILE_OPTIONS} + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR) + if (${EXEC_RESULT}) + message("add ops compile options info: ${EXEC_INFO}") + message("add ops compile options error: ${EXEC_ERROR}") + message(FATAL_ERROR "add ops compile options failed!") + endif() +endfunction() + +function(add_npu_support_target) + cmake_parse_arguments(NPUSUP "" "TARGET;OPS_INFO_DIR;OUT_DIR;INSTALL_DIR" "" 
${ARGN}) + get_filename_component(npu_sup_file_path "${NPUSUP_OUT_DIR}" DIRECTORY) + add_custom_command(OUTPUT ${NPUSUP_OUT_DIR}/npu_supported_ops.json + COMMAND mkdir -p ${NPUSUP_OUT_DIR} + COMMAND ${CMAKE_SOURCE_DIR}/cmake/util/gen_ops_filter.sh + ${NPUSUP_OPS_INFO_DIR} + ${NPUSUP_OUT_DIR} + ) + add_custom_target(npu_supported_ops ALL + DEPENDS ${NPUSUP_OUT_DIR}/npu_supported_ops.json + ) + if(NOT ${ASCEND_PACK_SHARED_LIBRARY}) + install(FILES ${NPUSUP_OUT_DIR}/npu_supported_ops.json + DESTINATION ${NPUSUP_INSTALL_DIR} + ) + endif() +endfunction() + +function(add_simple_kernel_compile) + set(options "") + set(single_value_args "OPS_INFO;OUT_DIR;TILING_LIB;OP_TYPE;SRC;COMPUTE_UNIT;JSON_FILE;DYNAMIC_PATH") + set(multi_value_args "OPTIONS;CONFIGS") + cmake_parse_arguments(BINCMP "${options}" "${single_value_args}" "${multi_value_args}" ${ARGN}) + if (NOT DEFINED BINCMP_OUT_DIR) + set(BINCMP_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/binary) + endif() + if (NOT DEFINED BINCMP_TILING_LIB) + set(BINCMP_TILING_LIB $) + endif() + if (${ASCEND_PACK_SHARED_LIBRARY}) + if (NOT TARGET op_kernel_pack) + add_custom_target(op_kernel_pack + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_pack_kernel.py + --input-path=${BINCMP_OUT_DIR} + --output-path=${BINCMP_OUT_DIR}/library + --enable-library=${ASCEND_PACK_SHARED_LIBRARY} + --platform=${CMAKE_SYSTEM_PROCESSOR}) + add_library(ascend_kernels INTERFACE) + target_link_libraries(ascend_kernels INTERFACE kernels) + target_link_directories(ascend_kernels INTERFACE ${BINCMP_OUT_DIR}/library) + target_include_directories(ascend_kernels INTERFACE ${BINCMP_OUT_DIR}/library) + add_dependencies(ascend_kernels op_kernel_pack) + add_dependencies(op_kernel_pack ${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT}) + endif() + endif() + # add Environment Variable Configurations of ccache + set(_ASCENDC_ENV_VAR) + if(${CMAKE_CXX_COMPILER_LAUNCHER} MATCHES "ccache$") + list(APPEND _ASCENDC_ENV_VAR export ASCENDC_CCACHE_EXECUTABLE=${CMAKE_CXX_COMPILER_LAUNCHER} &&) + endif() + + if (NOT DEFINED BINCMP_OPS_INFO) + set(BINCMP_OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${BINCMP_COMPUTE_UNIT}-ops-info.ini) + endif() + if (NOT ${ENABLE_CROSS_COMPILE}) + add_custom_target(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} + COMMAND ${_ASCENDC_ENV_VAR} ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_compile_kernel.py + --op-name=${BINCMP_OP_TYPE} + --src-file=${BINCMP_SRC} + --compute-unit=${BINCMP_COMPUTE_UNIT} + --compile-options=\"${BINCMP_OPTIONS}\" + --debug-config=\"${BINCMP_CONFIGS}\" + --config-ini=${BINCMP_OPS_INFO} + --tiling-lib=${BINCMP_TILING_LIB} + --output-path=${BINCMP_OUT_DIR} + --dynamic-dir=${BINCMP_DYNAMIC_PATH} + --enable-binary=\"${ENABLE_BINARY_PACKAGE}\" + --json-file=${BINCMP_JSON_FILE} + --build-tool=$(MAKE)) + add_dependencies(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} cust_optiling) + else() + if (${ENABLE_BINARY_PACKAGE} AND NOT DEFINED HOST_NATIVE_TILING_LIB) + message(FATAL_ERROR "Native host libs was not set for cross compile!") + endif() + add_custom_target(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} + COMMAND ${_ASCENDC_ENV_VAR} ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_compile_kernel.py + --op-name=${BINCMP_OP_TYPE} + --src-file=${BINCMP_SRC} + --compute-unit=${BINCMP_COMPUTE_UNIT} + --compile-options=\"${BINCMP_OPTIONS}\" + --debug-config=\"${BINCMP_CONFIGS}\" + --config-ini=${BINCMP_OPS_INFO} + --tiling-lib=${HOST_NATIVE_TILING_LIB} + --output-path=${BINCMP_OUT_DIR} + --dynamic-dir=${BINCMP_DYNAMIC_PATH} + 
--enable-binary=\"${ENABLE_BINARY_PACKAGE}\" + --json-file=${BINCMP_JSON_FILE} + --build-tool=$(MAKE)) + endif() + add_dependencies(ascendc_bin_${BINCMP_COMPUTE_UNIT}_gen_ops_config ${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT}) + add_dependencies(${BINCMP_OP_TYPE}_${BINCMP_COMPUTE_UNIT} ops_info_gen_${BINCMP_COMPUTE_UNIT}) +endfunction() + +function(ascendc_device_library) + message(STATUS "Ascendc device library generating") + cmake_parse_arguments(DEVICE "" "TARGET;OPTION" "SRC" ${ARGN}) + execute_process( + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink + COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink/CMakeLists.txt + ) + execute_process( + COMMAND ${CMAKE_COMMAND} -E echo "cmake_minimum_required(VERSION 3.16.0)\nproject(cust_tiling_sink)\ninclude(${CMAKE_SOURCE_DIR}/cmake/device_task.cmake)\n" + OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink/CMakeLists.txt + RESULT_VARIABLE result + ) + string(REPLACE ";" " " DEVICE_SRC "${DEVICE_SRC}") + ExternalProject_Add(tiling_sink_task + SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/tiling_sink + CONFIGURE_COMMAND ${CMAKE_COMMAND} + -DASCEND_CANN_PACKAGE_PATH=${ASCEND_CANN_PACKAGE_PATH} + -DTARGET=${DEVICE_TARGET} + -DOPTION=${DEVICE_OPTION} + -DSRC=${DEVICE_SRC} + -DVENDOR_NAME=${vendor_name} + + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} + INSTALL_COMMAND "" + BUILD_ALWAYS TRUE + ) + ExternalProject_Get_Property(tiling_sink_task BINARY_DIR) + set(TILINGSINK_LIB_PATH "") + if ("${DEVICE_OPTION}" STREQUAL "SHARED") + set(TILINGSINK_LIB_PATH "${BINARY_DIR}/libcust_opmaster.so") + else() + set(TILINGSINK_LIB_PATH "${BINARY_DIR}/libcust_opmaster.a") + endif() + install(FILES ${TILINGSINK_LIB_PATH} + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_master_device/lib + ) +endfunction() +function(add_opregistry_target) + string(REPLACE ";" "-" COMPUTE_UNIT "${ASCEND_COMPUTE_UNIT}") + add_custom_target(op_registry_pack + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_pack_opregistry.py + --input-path=${CMAKE_SOURCE_DIR}/build_out/ + --base-path=${CMAKE_SOURCE_DIR}/build_out/tmp/vendors/ + --output-path=${CMAKE_SOURCE_DIR}/build_out/library/ + --vendor-name=${vendor_name} + --compute-unit=${COMPUTE_UNIT} + --framework-type=${ASCEND_FRAMEWORK_TYPE} + --platform=${CMAKE_SYSTEM_PROCESSOR}) + add_library(ascend_opregistry INTERFACE) + target_link_libraries(ascend_opregistry INTERFACE opregistry) + target_link_directories(ascend_opregistry INTERFACE ${CMAKE_SOURCE_DIR}/build_out/library) + target_include_directories(ascend_opregistry INTERFACE ${CMAKE_SOURCE_DIR}/build_out/library) + add_dependencies(ascend_opregistry op_registry_pack) + if(EXISTS "${CMAKE_SOURCE_DIR}/framework/caffe_plugin") + add_dependencies(op_registry_pack cust_caffe_parsers) + elseif(EXISTS "${CMAKE_SOURCE_DIR}/framework/tf_plugin") + add_dependencies(op_registry_pack cust_tf_parsers) + elseif(EXISTS "${CMAKE_SOURCE_DIR}/framework/onnx_plugin") + add_dependencies(op_registry_pack cust_onnx_parsers) + endif() +endfunction() + +function(add_kernels_install) + # install kernel file + if (${ENABLE_SOURCE_PACKAGE}) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/binary/dynamic/ + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic/ + ) + endif() + + # install *.o files and *.json files + if (${ENABLE_BINARY_PACKAGE}) + set(INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/) + foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + 
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}/ + DESTINATION ${INSTALL_DIR}/kernel/${compute_unit}/ + ) + endforeach() + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/binary/config/ + DESTINATION ${INSTALL_DIR}/kernel/config/ + ) + endif() +endfunction() + +function(add_kernels_compile) + set(DYNAMIC_PATH "") + if (${ENABLE_SOURCE_PACKAGE}) + set(DYNAMIC_PATH ${CMAKE_CURRENT_BINARY_DIR}/binary/dynamic) + execute_process(COMMAND sh -c "mkdir -p ${DYNAMIC_PATH} && + cp -rf ${CMAKE_SOURCE_DIR}/op_kernel/* ${DYNAMIC_PATH}/ && + rm ${DYNAMIC_PATH}/CMakeLists.txt" + RESULT_VARIABLE EXEC_RESULT + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message(FATAL_ERROR, "copy_source_files failed, gen error:${EXEC_ERROR}" ) + endif() + endif() + + foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + # generate aic-${compute_unit}-ops-info.json + add_ops_info_target(TARGET ops_info_gen_${compute_unit} + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit} + ) + + # define a target:binary to prevent kernel file from being rebuilt during the preinstall process + if (NOT TARGET binary) + add_custom_target(binary) + endif() + + if (${ENABLE_BINARY_PACKAGE} OR ${ENABLE_SOURCE_PACKAGE}) + if (${ENABLE_BINARY_PACKAGE}) + # gen binary_info_config.json and .json + add_custom_target(ascendc_bin_${compute_unit}_gen_ops_config + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/insert_simplified_keys.py + -p ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_ops_config.py + -p ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + -s ${compute_unit} + COMMAND ${CMAKE_COMMAND} -E make_directory + ${CMAKE_CURRENT_BINARY_DIR}/binary/config/${compute_unit} + COMMAND mv ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit}/*.json + ${CMAKE_CURRENT_BINARY_DIR}/binary/config/${compute_unit} + ) + else() + if (NOT TARGET ascendc_bin_${compute_unit}_gen_ops_config) + add_custom_target(ascendc_bin_${compute_unit}_gen_ops_config) + endif() + endif() + add_dependencies(binary ascendc_bin_${compute_unit}_gen_ops_config) + + # get op_type-op_name from aic-${compute_unit}-ops-info.ini + execute_process(COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_get_op_name.py + --ini-file=${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + OUTPUT_VARIABLE OP_TYPE_NAME + RESULT_VARIABLE EXEC_RESULT + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message(FATAL_ERROR, "get op name failed, gen error: ${EXEC_ERROR}") + endif() + + # compile op one by one with ascendc_compile_kernel.py + string(REPLACE "\n" ";" TYPE_NAME_LIST "${OP_TYPE_NAME}") + foreach(TYPE_NAME IN LISTS TYPE_NAME_LIST) + if (NOT "${TYPE_NAME}" STREQUAL "") + string(REPLACE "-" ";" bin_sep ${TYPE_NAME}) + list(GET bin_sep 0 op_type) + list(GET bin_sep 1 op_file) + add_simple_kernel_compile(OP_TYPE ${op_type} + SRC ${CMAKE_SOURCE_DIR}/op_kernel/${op_file}.cpp + COMPUTE_UNIT ${compute_unit} + JSON_FILE ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json + DYNAMIC_PATH ${DYNAMIC_PATH}) + endif() + endforeach() + endif() + endforeach() + + # generate npu_supported_ops.json + add_npu_support_target(TARGET npu_supported_ops + OPS_INFO_DIR ${ASCEND_AUTOGEN_PATH} + OUT_DIR 
${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core + INSTALL_DIR packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE} + ) + + if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) + endif() + + if(NOT ASCEND_PACK_SHARED_LIBRARY) + add_kernels_install() + else() + add_opregistry_target() + endif() +endfunction() \ No newline at end of file diff --git a/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_host/CMakeLists.txt b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_host/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a6279ff469e98190072e341d2058941f197ccbf --- /dev/null +++ b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_host/CMakeLists.txt @@ -0,0 +1,183 @@ +if(DEFINED ENV{ATVC_PATH}) + set(ATVC_PATH $ENV{ATVC_PATH}) +else() + set(ATVC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../../../include") +endif() + +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ops_srcs) +opbuild(OPS_SRC ${ops_srcs} + OUT_DIR ${ASCEND_AUTOGEN_PATH} +) + +file(GLOB group_proto_src ${ASCEND_AUTOGEN_PATH}/group_proto/*.cc) + +add_library(cust_op_proto SHARED + $<$:${group_proto_src}> + ${ops_srcs} + ${ASCEND_AUTOGEN_PATH}/op_proto.cc +) +target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB) +target_compile_options(cust_op_proto PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_op_proto PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_include_directories(cust_op_proto PRIVATE ${ATVC_PATH}) +target_link_libraries(cust_op_proto PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME + cust_opsproto_rt2.0 +) +file(GLOB fallback_src ${ASCEND_AUTOGEN_PATH}/fallback_*.cpp) +add_library(cust_optiling SHARED ${ops_srcs}) +if (${fallback_src}) + target_sources(cust_optiling PRIVATE ${fallback_src}) +endif() +target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB) +target_compile_options(cust_optiling PRIVATE + -fvisibility=hidden +) +target_include_directories(cust_optiling PRIVATE + ${ATVC_PATH} +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_optiling PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_optiling PRIVATE + nnopbase + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME + cust_opmaster_rt2.0 +) + +file(GLOB aclnn_src ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp) +file(GLOB aclnn_inc ${ASCEND_AUTOGEN_PATH}/aclnn_*.h) + +if(NOT ASCEND_PACK_SHARED_LIBRARY) + add_library(cust_opapi SHARED ${aclnn_src}) +else() + file(GLOB op_registry ${ASCEND_AUTOGEN_PATH}/custom_op_registry.cpp) + add_library(cust_opapi SHARED ${aclnn_src} ${op_registry}) + target_compile_definitions(cust_opapi PRIVATE ACLNN_WITH_BINARY) +endif() + +target_include_directories(cust_opapi PRIVATE + ${ATVC_PATH} +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_opapi PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +if(NOT ASCEND_PACK_SHARED_LIBRARY) + target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase) +else() + add_library(cust_op_proto_obj OBJECT + $<$:${group_proto_src}> + ${ops_srcs} + ${ASCEND_AUTOGEN_PATH}/op_proto.cc + ) + 
target_compile_definitions(cust_op_proto_obj PRIVATE OP_PROTO_LIB) + target_compile_options(cust_op_proto_obj PRIVATE + -fvisibility=hidden + ) + if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_op_proto_obj PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) + endif() + target_link_libraries(cust_op_proto_obj PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive + ) + add_library(cust_optiling_obj OBJECT ${ops_srcs}) + target_compile_definitions(cust_optiling_obj PRIVATE OP_TILING_LIB) + target_compile_options(cust_optiling_obj PRIVATE + -fvisibility=hidden + ) + target_include_directories(cust_optiling_obj PRIVATE ${ATVC_PATH}) + if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_optiling_obj PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) + endif() + + target_link_libraries(cust_optiling_obj PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive + ) + target_compile_options(cust_opapi PRIVATE -DLOG_CPP) + target_include_directories(cust_opapi INTERFACE ${ATVC_PATH}) + target_include_directories(cust_opapi INTERFACE ${CMAKE_SOURCE_DIR}/build_out/binary/) + target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase cust_optiling_obj cust_op_proto_obj ascend_opregistry ascend_kernels) + add_dependencies(cust_opapi ascend_opregistry) +endif() + +add_custom_target(optiling_compat ALL + COMMAND ln -sf lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$ + ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so +) + +if(NOT ASCEND_PACK_SHARED_LIBRARY) + install(TARGETS cust_op_proto + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) + install(FILES ${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) + file(GLOB GROUP_PROTO_HEADERS ${ASCEND_AUTOGEN_PATH}/group_proto/*.h) + if (GROUP_PROTO_HEADERS) + install(FILES ${GROUP_PROTO_HEADERS} + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) + endif() + install(TARGETS cust_optiling + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) + install(TARGETS cust_opapi + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_api/lib) + install(FILES ${aclnn_inc} + DESTINATION packages/vendors/${vendor_name}/op_api/include) +else() + file(GLOB group_inc ${ASCEND_AUTOGEN_PATH}/group_proto/*.h) + install(TARGETS cust_opapi + LIBRARY DESTINATION op_api/lib) + install(FILES ${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION op_api/include) + install(FILES ${group_inc} + DESTINATION op_api/include) + install(FILES ${aclnn_inc} + DESTINATION op_api/include) +endif() diff --git a/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_host/reduce_sum_custom.cpp b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_host/reduce_sum_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..df23786935faa9f8f689dff799b3362a395bfa3c --- /dev/null +++ b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_host/reduce_sum_custom.cpp @@ -0,0 +1,85 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include "reduce/reduce_host.h"
+#include "register/op_def_registry.h"
+
+using ReduceOpTraitsFloat = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+using ReduceOpTraitsInt = ATVC::OpTraits<ATVC::OpInputs<int32_t>, ATVC::OpOutputs<int32_t>>;
+
+namespace optiling {
+static ge::graphStatus TilingFunc(gert::TilingContext *context)
+{
+    ATVC::ReducePolicy policy = {0, 0, 0};
+    auto inputShape0 = context->GetInputShape(0)->GetOriginShape();
+    std::vector<int64_t> shapeIn;
+    for (size_t i = 0; i < inputShape0.GetDimNum(); i++) {
+        shapeIn.push_back(inputShape0.GetDim(i));
+    }
+    // 获取dim值
+    const gert::RuntimeAttrs *runtimeAttrs = context->GetAttrs();
+    const gert::TypedContinuousVector<int64_t> *attr0 = runtimeAttrs->GetListInt(0);
+    const int64_t *arr = reinterpret_cast<const int64_t *>(attr0->GetData());
+    std::vector<int64_t> dim(arr, arr + attr0->GetSize());
+    ATVC::ReduceParam *tiling = context->GetTilingData<ATVC::ReduceParam>();
+    if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_FLOAT) {
+        // ReduceOpTraitsFloat为Reduce算子描述原型,根据算子输入shape和dim计算出Tiling数据后填入tiling中
+        (void)ATVC::Host::CalcReduceTiling<ReduceOpTraitsFloat>(shapeIn, dim, &policy, tiling);
+    } else if (context->GetInputDesc(0)->GetDataType() == ge::DataType::DT_INT32) {
+        (void)ATVC::Host::CalcReduceTiling<ReduceOpTraitsInt>(shapeIn, dim, &policy, tiling);
+    }
+    // 设置tiling的policyId为policy的id
+    tiling->policyId = policy.getID();
+    context->SetBlockDim(tiling->tilingData.coreNum);
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
+    currentWorkspace[0] = 0;
+    return ge::GRAPH_SUCCESS;
+}
+} // namespace optiling
+
+namespace ge {
+static graphStatus InferShape(gert::InferShapeContext *context)
+{
+    const gert::Shape *x1_shape = context->GetInputShape(0);
+    gert::Shape *y_shape = context->GetOutputShape(0);
+    *y_shape = *x1_shape;
+    return GRAPH_SUCCESS;
+}
+
+static graphStatus InferDataType(gert::InferDataTypeContext *context)
+{
+    const auto inputDataType = context->GetInputDataType(0);
+    context->SetOutputDataType(0, inputDataType);
+    return ge::GRAPH_SUCCESS;
+}
+} // namespace ge
+
+namespace ops {
+class ReduceSumCustom : public OpDef {
+public:
+    explicit ReduceSumCustom(const char *name) : OpDef(name)
+    {
+        this->Input("x")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Output("y")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT, ge::DT_INT32})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND});
+        this->Attr("dim").AttrType(REQUIRED).ListInt();
+
+        this->SetInferShape(ge::InferShape).SetInferDataType(ge::InferDataType);
+        this->AICore().SetTiling(optiling::TilingFunc).AddConfig("ascend910b");
+    }
+};
+OP_ADD(ReduceSumCustom);
+} // namespace ops
diff --git a/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_kernel/CMakeLists.txt b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_kernel/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1a719a60cf480bf01d15b9d1905038536d5e5ccb
--- /dev/null
+++ b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_kernel/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(CMAKE_VERSION_MAKEFILE ON)
+if(DEFINED ENV{ATVC_PATH})
+    set(ATVC_PATH $ENV{ATVC_PATH})
+else()
+    set(ATVC_PATH
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../../include") +endif() +if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx") + add_ops_compile_options(ALL OPTIONS -g -O0) +endif() + +add_ops_compile_options(ALL OPTIONS -g -O0 --cce-aicore-block-local-init -I ${ATVC_PATH}) +add_kernels_compile() \ No newline at end of file diff --git a/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_kernel/reduce_sum_custom.cpp b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_kernel/reduce_sum_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..50a5c73f975c464282ae32cc46da52666ca19529 --- /dev/null +++ b/atvc/examples/aclnn/reduce_sum/ReduceSumCustom/op_kernel/reduce_sum_custom.cpp @@ -0,0 +1,91 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "reduce/reduce_device.h" + +using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; + +extern "C" __global__ __aicore__ void reduce_sum_custom(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, GM_ADDR tiling) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); + REGISTER_TILING_DEFAULT(ATVC::ReduceParam); + GET_TILING_DATA(param, tiling); + if (param.policyId == ATVC::REDUCE_POLICY0.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY0>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY1.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY1>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY2.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY2>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY3.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY3>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY4.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY4>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY5.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY5>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY6.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY6>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY7.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY7>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY8.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY8>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY9.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY9>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY10.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY10>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY11.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY11>(); + op.Run(x, y, tiling); + } else if (param.policyId == 
ATVC::REDUCE_POLICY12.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY12>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY13.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY13>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY14.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY14>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY15.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY15>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY16.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY16>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY17.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY17>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY18.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY18>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY19.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY19>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY20.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY20>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY21.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY21>(); + op.Run(x, y, tiling); + } else if (param.policyId == ATVC::REDUCE_POLICY22.ID) { + auto op = ATVC::Kernel::ReduceOpTemplate, ATVC::REDUCE_POLICY22>(); + op.Run(x, y, tiling); + } +} diff --git a/atvc/examples/aclnn/reduce_sum/install.sh b/atvc/examples/aclnn/reduce_sum/install.sh new file mode 100644 index 0000000000000000000000000000000000000000..afe455ade5b53bd16ac6b4206650232bf32cf8e2 --- /dev/null +++ b/atvc/examples/aclnn/reduce_sum/install.sh @@ -0,0 +1,57 @@ +#!/bin/bash +SHORT=v:,i:, +LONG=soc-version:,install-path:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" + +while :; do + case "$1" in + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +VERSION_LIST="Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi +source $_ASCEND_INSTALL_PATH/bin/setenv.bash +export ASCEND_HOME_PATH=$_ASCEND_INSTALL_PATH + +OP_NAME=ReduceSumCustom +rm -rf CustomOp +# Generate the op framework +msopgen gen -i $OP_NAME.json -c ai_core-${SOC_VERSION} -lan cpp -out CustomOp +# Copy op implementation files to CustomOp +cp -rf $OP_NAME/* CustomOp +# Delete tiling.h +rm -rf CustomOp/op_host/*._tiling.h +# Build CustomOp project +(cd CustomOp && bash build.sh) \ No newline at end of file diff --git a/atvc/examples/add/add.cpp b/atvc/examples/add/add.cpp index 04ea4a22766471183cb00a790918d83d7c443c16..f5a410a114299bbff4b620dd15bce57201812688 100644 --- a/atvc/examples/add/add.cpp 
+++ b/atvc/examples/add/add.cpp @@ -15,7 +15,8 @@ #include #include #include "acl/acl.h" -#include "atvc.h" +#include "elewise/elewise_host.h" +#include "elewise/elewise_device.h" #define CHECK_ACL(x) \ do { \ @@ -36,7 +37,7 @@ bool IsClose(float a, float b) float diff = std::abs(a - b); return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); } -} + // Add算子中有两个输入,一个输出。类型均为float using ADD_OPTRAITS = ATVC::OpTraits, ATVC::OpOutputs>; @@ -57,6 +58,59 @@ struct AddComputeFunc { } }; +void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &inputY, std::vector &golden) +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(1.0f, 100.0f); + + for (int i = 0; i < eleNum; ++i) { + inputX[i] = dis(gen); + inputY[i] = dis(gen); + golden[i] = inputX[i] + inputY[i]; + } +} + +bool VerifyResults(const std::vector &golden, const std::vector &output) +{ + for (int32_t i = 0; i < golden.size(); i++) { + if (!IsClose(golden[i], output[i])) { + printf("Accuracy verification failed! The expected value of element " + "in index [%d] is %f, but actual value is %f.\n", + i, + golden[i], + output[i]); + return false; + } + } + return true; +} + +void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void CleanACL(aclrtStream &stream, int32_t deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void CleanUp(uint8_t *&zHost, uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&zDevice, uint8_t *¶mDevice) +{ + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(zDevice)); + CHECK_ACL(aclrtFree(paramDevice)); + CHECK_ACL(aclrtFreeHost(zHost)); +} +} + template /* * 该函数为Add算子核函数入口 @@ -79,38 +133,23 @@ int main() int32_t eleNum = 8 * 1024; size_t inputByteSize = static_cast(eleNum) * sizeof(float); size_t outputByteSize = static_cast(eleNum) * sizeof(float); - - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dis(1.0f, 100.0f); std::vector inputX(eleNum); std::vector inputY(eleNum); std::vector golden(eleNum); + InitializeData(eleNum, inputX, inputY, golden); - // 生成输入数据 - for (int i = 0; i < eleNum; ++i) { - inputX[i] = (dis(gen)); - inputY[i] = (dis(gen)); - } - for (int i = 0; i < eleNum; ++i) { - golden[i] = (inputX[i]) + (inputY[i]); - } - printf("Generate golden data successfully.\n"); - // 声明运行态参数param - ATVC::EleWiseParam param; + aclrtContext context; + aclrtStream stream = nullptr; + int32_t deviceId = 0; + InitializeACL(context, stream, deviceId); + ATVC::EleWiseParam param; if (!ATVC::Host::CalcEleWiseTiling(eleNum, param)) { printf("Elewise tiling error.\n"); return -1; }; - CHECK_ACL(aclInit(nullptr)); - aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); + auto elementParamSize = sizeof(param); uint8_t *zHost; uint8_t *xDevice; @@ -118,45 +157,30 @@ int main() uint8_t *zDevice; uint8_t *paramDevice; - CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputByteSize)); + CHECK_ACL(aclrtMallocHost((void **)&zHost, outputByteSize)); CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); 
CHECK_ACL(aclrtMalloc((void **)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMalloc((void **)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)¶mDevice, elementParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, inputY.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); - - auto elementParamSize = sizeof(param); - CHECK_ACL(aclrtMalloc((void**)¶mDevice, elementParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, - reinterpret_cast(¶m), elementParamSize, - ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, reinterpret_cast(¶m), + elementParamSize, ACL_MEMCPY_HOST_TO_DEVICE)); uint32_t blockNum = param.tilingData.blockNum; // 调用核函数 AddCustom<<>>(xDevice, yDevice, zDevice, paramDevice); CHECK_ACL(aclrtSynchronizeStream(stream)); CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); - - std::vector outputZ(reinterpret_cast(zHost), reinterpret_cast(zHost) + eleNum); - CHECK_ACL(aclrtFree(xDevice)); - CHECK_ACL(aclrtFree(yDevice)); - CHECK_ACL(aclrtFree(zDevice)); - CHECK_ACL(aclrtFree(paramDevice)); - CHECK_ACL(aclrtFreeHost(zHost)); + std::vector outputZ(reinterpret_cast(zHost), reinterpret_cast(zHost) + eleNum); - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); + CleanUp(zHost, xDevice, yDevice, zDevice, paramDevice); + CleanACL(stream, deviceId); - for (int32_t i = 0; i < eleNum; i++) { - if (!IsClose(golden[i], outputZ[i])) { - printf("Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, golden[i], outputZ[i]); - return -1; - } + if (!VerifyResults(golden, outputZ)) { + return -1; } printf("Accuracy verification passed.\n"); return 0; diff --git a/atvc/examples/add_with_scalar/add_with_scalar.cpp b/atvc/examples/add_with_scalar/add_with_scalar.cpp index c7aa38993b6f87bfae09e4f5928b62034857ec5c..a2f60e3943e6918a333ad3d6cc6b8bfac91999d9 100644 --- a/atvc/examples/add_with_scalar/add_with_scalar.cpp +++ b/atvc/examples/add_with_scalar/add_with_scalar.cpp @@ -15,7 +15,8 @@ #include #include #include "acl/acl.h" -#include "atvc.h" +#include "elewise/elewise_host.h" +#include "elewise/elewise_device.h" #define CHECK_ACL(x) \ do { \ @@ -36,7 +37,6 @@ bool IsClose(float a, float b) float diff = std::abs(a - b); return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); } -} using OP_TRAITS = ATVC::OpTraits, ATVC::OpOutputs, ATVC::OpTemps>; @@ -55,6 +55,64 @@ struct AddComputeFunc { } }; +void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &inputY, std::vector &golden, + bool conditionVal) +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(1.0f, 100.0f); + + for (int i = 0; i < eleNum; ++i) { + inputX[i] = dis(gen); + inputY[i] = dis(gen); + if (conditionVal) { + golden[i] = 2 * (inputX[i]) + (inputY[i]); // z = 2 * x + y + } else { + golden[i] = 2 * (inputX[i]) - (inputY[i]); // z = 2 * x - y + } + } +} + +bool VerifyResults(const std::vector &golden, const std::vector &output) +{ + for (int32_t i = 0; i < golden.size(); i++) { + if (!IsClose(golden[i], output[i])) { + printf("Accuracy verification failed! 
The expected value of element " + "in index [%d] is %f, but actual value is %f.\n", + i, + golden[i], + output[i]); + return false; + } + } + return true; +} + +void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void CleanACL(aclrtStream &stream, int32_t deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void CleanUp(uint8_t *&zHost, uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&zDevice, uint8_t *¶mDevice) +{ + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(zDevice)); + CHECK_ACL(aclrtFree(paramDevice)); + CHECK_ACL(aclrtFreeHost(zHost)); +} +} + /* * 该函数为AddCustom算子核函数入口 * a Device上的gm地址,指向算子第一个输入 @@ -71,46 +129,31 @@ __global__ __aicore__ void AddCustom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR pa op.Run(a, b, c, param, conditionVal); // 调用Run函数, 执行算子 } + int main() { int32_t eleNum = 8 * 1024; size_t inputByteSize = static_cast(eleNum) * sizeof(float); size_t outputByteSize = static_cast(eleNum) * sizeof(float); - - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dis(1.0f, 100.0f); std::vector inputX(eleNum); std::vector inputY(eleNum); std::vector golden(eleNum); bool conditionVal = false; // 生成输入数据 - for (int i = 0; i < eleNum; ++i) { - inputX[i] = (dis(gen)); - inputY[i] = (dis(gen)); - } - for (int i = 0; i < eleNum; ++i) { - if (conditionVal) { - golden[i] = 2 * (inputX[i]) + (inputY[i]); // z = 2 * x + y - } else { - golden[i] = 2 * (inputX[i]) - (inputY[i]); // z = 2 * x - y - } - } - printf("Generate golden data successfully.\n"); - ATVC::EleWiseParam param; + InitializeData(eleNum, inputX, inputY, golden, conditionVal); - if (!ATVC::Host::CalcEleWiseTiling(eleNum, param)) { - printf("Elewise tiling error.\n"); - return -1; - }; - CHECK_ACL(aclInit(nullptr)); aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); + int32_t deviceId = 0; + InitializeACL(context, stream, deviceId); + + ATVC::EleWiseParam param; + if (!ATVC::Host::CalcEleWiseTiling(eleNum, param)) { + printf("Elewise tiling error.\n"); + return -1; + }; + auto elementParamSize = sizeof(param); uint8_t *zHost; uint8_t *xDevice; @@ -118,45 +161,30 @@ int main() uint8_t *zDevice; uint8_t *paramDevice; - CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputByteSize)); + CHECK_ACL(aclrtMallocHost((void **)&zHost, outputByteSize)); CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMalloc((void **)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMalloc((void **)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)¶mDevice, elementParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, inputY.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, reinterpret_cast(¶m), + elementParamSize, ACL_MEMCPY_HOST_TO_DEVICE)); - auto elementParamSize = sizeof(param); - CHECK_ACL(aclrtMalloc((void**)¶mDevice, elementParamSize, 
ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, - reinterpret_cast(¶m), elementParamSize, - ACL_MEMCPY_HOST_TO_DEVICE)); - - AddCustom<<>>( - xDevice, yDevice, zDevice, paramDevice, conditionVal); + AddCustom + <<>>(xDevice, yDevice, zDevice, paramDevice, conditionVal); CHECK_ACL(aclrtSynchronizeStream(stream)); CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); - - std::vector outputZ(reinterpret_cast(zHost), reinterpret_cast(zHost) + eleNum); - CHECK_ACL(aclrtFree(xDevice)); - CHECK_ACL(aclrtFree(yDevice)); - CHECK_ACL(aclrtFree(zDevice)); - CHECK_ACL(aclrtFree(paramDevice)); - CHECK_ACL(aclrtFreeHost(zHost)); + std::vector outputZ(reinterpret_cast(zHost), reinterpret_cast(zHost) + eleNum); - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); + CleanUp(zHost, xDevice, yDevice, zDevice, paramDevice); + CleanACL(stream, deviceId); - for (int32_t i = 0; i < eleNum; i++) { - if (!IsClose(golden[i], outputZ[i])) { - printf("Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, golden[i], outputZ[i]); + if (!VerifyResults(golden, outputZ)) { return -1; - } } printf("Accuracy verification passed.\n"); return 0; diff --git a/atvc/examples/broadcast_to/broadcast_to.cpp b/atvc/examples/broadcast_to/broadcast_to.cpp index 9b4aa6df0a3aee5631b4370183258a5083e64fab..c3a0794413ded63fe04e34d679afeae1d02c9619 100644 --- a/atvc/examples/broadcast_to/broadcast_to.cpp +++ b/atvc/examples/broadcast_to/broadcast_to.cpp @@ -16,7 +16,8 @@ #include #include #include "acl/acl.h" -#include "atvc.h" +#include "broadcast/broadcast_host.h" +#include "broadcast/broadcast_device.h" #define CHECK_ACL(x) \ do { \ @@ -36,10 +37,10 @@ bool IsClose(float a, float b) { float diff = std::abs(a - b); return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); } -} // BroadcastTo算子的描述:一个输入,一个输出,类型均为float using BroadcastOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; +} /* * 该函数为BroadcastCustom算子核函数入口 @@ -56,7 +57,6 @@ __global__ __aicore__ void BroadcastCustom(GM_ADDR x, GM_ADDR y, GM_ADDR broadca op.Run(x, y, broadcastParam); } - // 负责Broadcast类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 template void BroadcastOpAdapter(uint8_t* x, uint8_t* y, ATVC::BroadcastParam ¶m, ATVC::BroadcastPolicy &policy, aclrtStream& stream) @@ -113,7 +113,7 @@ int32_t main(int32_t argc, char* argv[]) CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); ATVC::BroadcastParam param; // Broadcast运行态参数,包含TilingData以及临时空间的相关信息 - ATVC::BroadcastPolicy policy; // Broadcast运行态参数,负责映射最适合的Broadcast模板实现 + ATVC::BroadcastPolicy policy = {-1, -1, -1}; // Broadcast运行态参数,负责映射最适合的Broadcast模板实现 // Host侧调用Tiling API完成相关运行态参数的运算 if (!ATVC::Host::CalcBroadcastTiling(shapeIn, shapeOut, &policy, ¶m)) { printf("Broadcast tiling error.\n"); diff --git a/atvc/examples/pytorch/README.md b/atvc/examples/pytorch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..945054b730a2b53ca50fc14102e4bf89d65c174b --- /dev/null +++ b/atvc/examples/pytorch/README.md @@ -0,0 +1,192 @@ + ## 概述 +使用ATVC对接pytorch工程简单的示例,适合初学者。 + +## pytorch算子样例说明 +样例通过Ascend C编程语言实现了ATVC框架对接pytorch算子,并按照算子调用方式分别给出了对应的端到端实现。 + +## 算子开发样例 +| 目录名称 | 功能描述 | +| ------------------------------------------------------------ | ---------------------------------------------------- | +| [add](./add) | 基于ATVC框架的Add自定义Vector算子 | 
+| [reduce_sum](./reduce_sum) | 基于ATVC框架的reduce_sum自定义Vector算子 | + +## 快速上手 + + 快速执行example用例,更详细的流程请参阅[add算子](./add/README.md)。 + +- 下载ATVC代码及环境配置,参考[快速入门](../../../docs/1_quick_start.md)。 + +- 导入ATVC环境变量 + ```bash + # 如果不导入,默认使用./atvc/include路径 + export ATVC_PATH=${atvc}/include + ``` + + - 执行add用例 + ```bash + # 基于ATVC编译pytorch Add算子 + $ cd ./atvc/examples/pytorch/add + $ bash run.sh + ... + OK + ``` + +## 基于pytorch算子对接ATVC框架 + +### 步骤1. 定义算子描述,参考[add_custom_impl.h](./add/add_custom_impl.h) + + 首先通过ATVC提供的`ATVC::OpTraits`模板结构体来描述Add算子的输入输出信息,定义如下: +```cpp + // Add算子中有两个输入,一个输出。类型均为float + using AddOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; +``` + +### 步骤2. 实现算子计算逻辑,参考[add_custom_impl.h](./add/add_custom_impl.h) + + 用户需要通过AscendC API来搭建Add算子的核心计算逻辑,在ATVC框架中,这类算子的核心计算逻辑是通过定义一个结构体的仿函数来实现。它需要`ATVC::OpTraits`作为固定模板参数,并重载`operator()`来被提供的Kernel层算子模板类调用。 + +```cpp + // 头文件引入 + #include "elewise/elewise_host.h" + #include "elewise/elewise_device.h" + + // 传入编译态参数ATVC::OpTraits + template + struct AddComputeFunc { + /* + 函数说明: z = x + y + 参数说明: + x : 参与运算的输入 + y : 参与运算的输入 + z : 参与运算的输出 + */ + template + // 重载operator,提供给算子模板类调用 + __aicore__ inline void operator()(AscendC::LocalTensor x, AscendC::LocalTensor y, AscendC::LocalTensor z) { + AscendC::Add(z, x, y, z.GetSize()); // 开发调用AscendC Api自行实现计算逻辑, 通过c.GetSize()获取单次计算的元素数量 + } + }; +``` +### 步骤3. 实现核函数,参考[add_custom_impl.h](./add/add_custom_impl.h) + + ATVC提供的`ATVC::Kernel::EleWiseOpTemplate`算子模板类实现了核内的数据搬运、资源申请和计算调度功能。它将计算仿函数作为模板参数传入来完成构造实例化,用户可通过调用`ATVC::Kernel::EleWiseOpTemplate`算子模板类的`Run(Args&&... args)`接口完成算子的功能计算,完成完整核函数的实现。 在examples/add用例中,算子核函数的形式参数除了输入输出之外,还需额外传入`ATVC::EleWiseParam param`的形参,需要用户在初始化`ATVC::Kernel::EleWiseOpTemplate`时指定`ATVC::EleWiseParam*`的类型。该参数包含算子模板类进行数据搬运数据的必要参数,由`ATVC::Host::CalcEleWiseTiling` API计算得出。 + +```cpp + /* + * 该函数为Add算子核函数入口 + * a Device上的gm地址,指向Add算子第一个输入 + * b Device上的gm地址,指向Add算子第二个输入 + * c Device上的gm地址,指向Add算子第一个输出 + * param ATVC::EleWiseParam数据 + */ + template + __global__ __aicore__ void AddCustom(GM_ADDR a, GM_ADDR b, GM_ADDR c, ATVC::EleWiseParam param) + { + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); + auto op = ATVC::Kernel::EleWiseOpTemplate, ATVC::EleWiseParam*>(); // 将AddComputeFunc仿函数作为模板参数传入,实例化EleWiseOpTemplate模板类,并指明param的数据类型 + op.Run(x, y, z, ¶m); // 按照输入、输出、param的顺序传入Run函数,实现GM->GM的数据计算 + } +``` + 备注:reduce类算子在初始化的时候,` ATVC::Kernel::ReduceOpTemplate, Policy, void, void, ATVC::ReduceParam*>`时指定`ATVC::ReduceParam*`的类型;broadcast类算子在初始化的时候,` ATVC::Kernel::ATVC::Kernel::BroadcastOpTemplate, Policy, ATVC::BroadcastParam*>`时指定`ATVC::BroadcastParam*`的类型。 + + ### 步骤4. 
编写torch入口[pytorch_ascendc_extension.cpp](./add/pytorch_ascendc_extension.cpp) + + 不同的算子类型可参考[快速入门](../../docs/1_quick_start.md)中的模版选择模版进行选择,用户在此处通过`<<<>>>`的方式调用核函数,更多ATVC的用法可参考atvc的[开发指南](../../docs/2_developer_guide.md)。 + +``` cpp + // 头文件引入 + #include + #include "torch_npu/csrc/core/npu/NPUStream.h" + #include "add_custom_impl.h" + + namespace ascendc_elewise_ops { + at::Tensor op_add_custom(const at::Tensor &x, const at::Tensor &y) + { + auto stream = c10_npu::getCurrentNPUStream().stream(false); + at::Tensor z = at::empty_like(x); + // 创建ATVC框架[elewise]所需要的数据,算子数据的长度 + int32_t totalLength = 1; + for (int32_t size : x.sizes()) { + totalLength *= size; + } + // 声明运行态参数param + ATVC::EleWiseParam param; + // Host侧调用Tiling API完成相关运行态参数的运算 + (void)ATVC::Host::CalcEleWiseTiling(totalLength, param); + // 调用核函数 + AddCustom<<>>( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), (uint8_t *)(z.storage().data()), param); + + return z; + } + // 加载算子模版 + TORCH_LIBRARY(ascendc_ops, m) + { + // torch的自定义算子接口 + m.def("add", &ascendc_elewise_ops::op_add_custom); + } + } +``` +### 步骤5. 编写测试用例,参考[run_op.py](./add/run_op.py) +```python + # 导入torch torch所需依赖 + import torch + import torch_npu + # 导入测试用例依赖 + from torch_npu.testing.testcase import TestCase, run_tests + torch.npu.config.allow_internal_format = False + # 加载bishengcc编译出的二进制文件 + torch.ops.load_library('./libascendc_pytorch.so') + + # 测试用例编写 + class TestAscendCOps(TestCase): + # 测试用例 + def test_add_custom_ops_float(self): + length = [8, 2048] + # 生成随机数 + x = torch.rand(length, device='cpu', dtype=torch.float32) + y = torch.rand(length, device='cpu', dtype=torch.float32) + # 调用torch的自定义算子接口 + npuout = torch.ops.ascendc_ops.add(x.npu(), y.npu()) + cpuout = torch.add(x, y) + self.assertRtolEqual(npuout, cpuout) + + if __name__ == '__main__': + run_tests() +``` +### 步骤6. 基于atvc框架pytorch的编译和测试脚本[run.sh](./add/run.sh) +```bash + # 获取torch、torch_npu、python的lib和include路径和atvc的路径 + torch_location=... + torch_npu_location=... + python_include=... + python_lib=... + atvc_path=... + + # 使用bishengcc进行编译pytorch算子 + bishengcc pytorch_ascendc_extension.cpp \ + -arch Ascend910B1 \ + -I${torch_location}/include \ + -I${torch_location}/include/torch/csrc/api/include \ + -I${python_include} \ + -I${atvc_path} \ + -I${torch_npu_location}/include \ + -L${torch_location}/lib \ + -L${torch_npu_location}/lib \ + -L${python_lib} \ + -ltorch -ltorch_cpu -lc10 -ltorch_npu -lpython3 -ltorch_python \ + -o libascendc_pytorch.so \ + -shared + + # 执行测试用例 + python3 run_op.py +``` + +### 步骤7. 算子编译&执行 +```bash + # 基于ATVC编译pytorch Add算子 + $ cd ./atvc/examples/pytorch/add + $ bash run.sh + ... + OK +``` diff --git a/atvc/examples/pytorch/add/README.md b/atvc/examples/pytorch/add/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5df331b32080080cb36959de9ae11e4f1adf8b9d --- /dev/null +++ b/atvc/examples/pytorch/add/README.md @@ -0,0 +1,153 @@ + ## 概述 +本样例基于AddCustom算子工程,介绍了基于ATVC的单算子工程、单算子调用。 + +## 目录结构介绍 +``` +├── add +│ ├── add_custom_impl.h // 通过pytroch调用的方式调用Add算子 +│ ├── pytorch_ascendc_extension.cpp // pytorch调用入口 +│ ├── run_op.py // pytorch的测试用例 +│ └── run.sh // 脚本,编译需要的二进制文件,并测试 +``` + +## 算子描述 +Add算子实现了两个数据相加,返回相加结果的功能。对应的数学表达式为: +``` +z = x + y +``` +## 算子规格描述 + + + + + + + + + + + +
+| 算子类型(OpType) | Add | | | |
+| ---------------- | --------- | -------- | --------- | ------ |
+| 算子输入 | name | shape | data type | format |
+| | x | 8 * 2048 | float,int | ND |
+| | y | 8 * 2048 | float,int | ND |
+| 算子输出 | z | 8 * 2048 | float,int | ND |
+| 核函数名 | AddCustom | | | |
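+
+上表中float、int两种数据类型各对应一份算子描述。下面是按规格表推测的OpTraits声明示意(仅为草图,模板实参以add_custom_impl.h中的实际定义为准):
+```cpp
+// 按规格表推测的算子描述声明(示意;以add_custom_impl.h为准)
+using AddOpTraitsFloat = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>>;
+using AddOpTraitsInt = ATVC::OpTraits<ATVC::OpInputs<int32_t, int32_t>, ATVC::OpOutputs<int32_t>>;
+```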
+ +## 支持的产品型号 +本样例支持如下产品型号: +- Atlas A2训练系列产品 + +## 算子工程介绍 +其中,算子工程目录AddCustom包含算子的实现文件,如下所示: +``` +├── add +│ ├── add_custom_impl.h // 通过pytroch调用的方式调用Add算子 +│ ├── pytorch_ascendc_extension.cpp // pytorch调用入口 +│ ├── run_op.py // pytorch的测试用例 +│ └── run.sh // 脚本,编译需要的二进制文件,并测试 +``` + +run.sh脚本编译pytorch所需要的算子依赖,并将执行测试用例。 + +## 编译运行样例算子 +针对自定义算子工程,编译运行包含如下步骤: +- 完成算子pytorch入口和impl文件的实现; +- 编译pytorch算子的二进制文件; +- 调用执行pytorch算子; + +详细操作如下所示。 +### 1. 获取源码包及环境配置 +编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 +### 1. 安装pytorch环境 +参考[torch的安装](https://gitee.com/ascend/pytorch)进行安装torch、torch_npu环境 + +### 2. 基于ATVC编写pytorch算子的实现 + - 编写kernel侧函数,参考[add_custom_impl.h](./add_custom_impl.h) + ```cpp + // 引入头文件 + #include "elewise/elewise_host.h" + #include "elewise/elewise_device.h" + + // 首先通过ATVC提供的ATVC::OpTraits模板结构体来描述Add算子的输入输出信息,定义如下 + using AddOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; + using AddOpTraitsInt = ATVC::OpTraits, ATVC::OpOutputs>; + + // 实现算子计算逻辑 + // 传入编译态参数ATVC::OpTraits + template + struct AddComputeFunc { + /* + 函数说明: z = x + y + 参数说明: + x : 参与运算的输入 + y : 参与运算的输入 + z : 参与运算的输出 + */ + template + // 重载operator,提供给算子模板类调用 + __aicore__ inline void operator()(AscendC::LocalTensor x, AscendC::LocalTensor y, AscendC::LocalTensor z) { + AscendC::Add(z, x, y, z.GetSize()); // 开发调用AscendC Api自行实现计算逻辑, 通过z.GetSize()获取单次计算的元素数量 + } + }; + + //实现核函数 + template + __global__ __aicore__ void AddCustom(GM_ADDR x, GM_ADDR y, GM_ADDR z, ATVC::EleWiseParam param) + { + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); + // 将AddComputeFunc仿函数作为模板参数传入,实例化EleWiseOpTemplate模板类 + auto op = ATVC::Kernel::EleWiseOpTemplate, ATVC::EleWiseParam*>(); + op.Run(x, y, z, ¶m); + } + ``` + - 编写pytorch入口函数,并调用核函数,参考[pytorch_ascendc_extension.cpp](./pytorch_ascendc_extension.cpp) + ```cpp + at::Tensor op_add_custom(const at::Tensor &x, const at::Tensor &y) + { + auto stream = c10_npu::getCurrentNPUStream().stream(false); + ATVC::EleWiseParam param; + int32_t totalLength = 1; + for (int32_t size : x.sizes()) { + totalLength *= size; + } + (void)ATVC::Host::CalcEleWiseTiling(totalLength, param); + AddCustom<<>>( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), (uint8_t *)(z.storage().data()), param); + } + ``` + - 编写python调用函数,并调用pytorch入口函数,参考[run_op.py](./run_op.py) + + ```py + # 引入头文件 + import torch + import torch_npu + import numpy as np + from torch_npu.testing.testcase import TestCase, run_tests + # 加载二进制 + torch.npu.config.allow_internal_format = False + torch.ops.load_library('./libascendc_pytorch.so') + + class TestAscendCOps(TestCase): + # 测试用例 + def test_add_custom_ops_float(self): + length = [8, 2048] + x = torch.rand(length, device='cpu', dtype=torch.float32) + y = torch.rand(length, device='cpu', dtype=torch.float32) + npuout = torch.ops.ascendc_ops.add(x.npu(), y.npu()) + cpuout = torch.add(x, y) + self.assertRtolEqual(npuout, cpuout) + + # 测试用例调用 + if __name__ == '__main__': + run_tests() + ``` + +### 3. 基于ATVC编写pytorch算子的调用验证 + - 调用脚本,生成pytorch算子,并运行测试用例 + ```bash + $ cd ./atvc/examples/pytorch/add + $ bash run.sh + ... 
+ OK + ``` + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ---------------------------- | +| 2025/07/24 | 新增readme | diff --git a/atvc/examples/pytorch/add/add_custom_impl.h b/atvc/examples/pytorch/add/add_custom_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..86a6912caaaa3264324a68740fa98a70fcb48566 --- /dev/null +++ b/atvc/examples/pytorch/add/add_custom_impl.h @@ -0,0 +1,52 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ADD_CUSTOM_IMPL_H +#define ADD_CUSTOM_IMPL_H +#include "elewise/elewise_host.h" +#include "elewise/elewise_device.h" + +using AddOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; +using AddOpTraitsInt = ATVC::OpTraits, ATVC::OpOutputs>; + +// 传入编译态参数ATVC::OpTraits +template +struct AddComputeFunc { + /* + 函数说明: z = x + y + 参数说明: + x : 参与运算的输入 + y : 参与运算的输入 + z : 参与运算的输出 + */ + template + // 重载operator,提供给算子模板类调用 + __aicore__ inline void operator()(AscendC::LocalTensor x, AscendC::LocalTensor y, AscendC::LocalTensor z) { + AscendC::Add(z, x, y, z.GetSize()); // 开发调用AscendC Api自行实现计算逻辑, 通过c.GetSize()获取单次计算的元素数量 + } +}; + +/* + * 该函数为Add算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第二个输入 + * z Device上的gm地址,指向Add算子第一个输出 + * param ATVC::EleWiseParam数据 + */ +template +__global__ __aicore__ void AddCustom(GM_ADDR x, GM_ADDR y, GM_ADDR z, ATVC::EleWiseParam param) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); + // 将AddComputeFunc仿函数作为模板参数传入,实例化EleWiseOpTemplate模板类 + auto op = ATVC::Kernel::EleWiseOpTemplate, ATVC::EleWiseParam*>(); + op.Run(x, y, z, ¶m); +} +#endif diff --git a/atvc/examples/pytorch/add/pytorch_ascendc_extension.cpp b/atvc/examples/pytorch/add/pytorch_ascendc_extension.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bc6444720e21762fbe052cbdff90c7225c77592b --- /dev/null +++ b/atvc/examples/pytorch/add/pytorch_ascendc_extension.cpp @@ -0,0 +1,46 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include +#include "torch_npu/csrc/core/npu/NPUStream.h" +#include "add_custom_impl.h" + +namespace ascendc_elewise_ops { +at::Tensor op_add_custom(const at::Tensor &x, const at::Tensor &y) +{ + auto stream = c10_npu::getCurrentNPUStream().stream(false); + at::Tensor z = at::empty_like(x); + int32_t totalLength = 1; + for (int32_t size : x.sizes()) { + totalLength *= size; + } + // 声明运行态参数param + ATVC::EleWiseParam param; + if (x.scalar_type() == at::kFloat) { + // Host侧调用Tiling API完成相关运行态参数的运算 + (void)ATVC::Host::CalcEleWiseTiling(totalLength, param); + // 调用核函数 + AddCustom<<>>( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), (uint8_t *)(z.storage().data()), param); + } else if (x.scalar_type() == at::kInt) { + (void)ATVC::Host::CalcEleWiseTiling(totalLength, param); + // 调用核函数 + AddCustom<<>>( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), (uint8_t *)(z.storage().data()), param); + } + return z; +} + +TORCH_LIBRARY(ascendc_ops, m) +{ + m.def("add", &ascendc_elewise_ops::op_add_custom); +} +} // namespace ascendc_elewise_ops \ No newline at end of file diff --git a/atvc/examples/pytorch/add/run.sh b/atvc/examples/pytorch/add/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..2650c7dc98f5ea7b22a49b1a57a1c9d9b64a5d0c --- /dev/null +++ b/atvc/examples/pytorch/add/run.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +torch_location=$(python3 -c "import torch; print(torch.__path__[0])") +torch_npu_location=$(python3 -c "import torch_npu; print(torch_npu.__path__[0])") +python_include=$(python3 -c "import sysconfig; print(sysconfig.get_path('include'))") +python_lib=$(python3 -c "import sysconfig; print(sysconfig.get_path('stdlib'))") +export LD_LIBRARY_PATH=${torch_npu_location}/lib/:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=${torch_location}/lib/:$LD_LIBRARY_PATH +if [ -z "$ATVC_PATH" ]; then + atvc_path=$(realpath ../../../include) +else + atvc_path=$ATVC_PATH +fi + +rm -rf *.json +rm -rf libascendc_pytorch.so + + +bishengcc pytorch_ascendc_extension.cpp \ + -arch Ascend910B1 \ + -I${torch_location}/include \ + -I${torch_location}/include/torch/csrc/api/include \ + -I${python_include} \ + -I${atvc_path} \ + -I${torch_npu_location}/include \ + -L${torch_location}/lib \ + -L${torch_npu_location}/lib \ + -L${python_lib} \ + -ltorch -ltorch_cpu -lc10 -ltorch_npu -lpython3 -ltorch_python \ + -o libascendc_pytorch.so \ + -shared + +python3 run_op.py + +if [ $? -ne 0 ]; then + echo "ERROR: verify result failed! the result is wrong!" 
+    exit 1
+fi
+
+rm -rf *.json
+rm -rf libascendc_pytorch.so
\ No newline at end of file
diff --git a/atvc/examples/pytorch/add/run_op.py b/atvc/examples/pytorch/add/run_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..52ee702116e83f8f13ec22406c3090f1e17529ae
--- /dev/null
+++ b/atvc/examples/pytorch/add/run_op.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+#
+# This file is a part of the CANN Open Software.
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+
+import torch
+import torch_npu
+from torch_npu.testing.testcase import TestCase, run_tests
+
+torch.npu.config.allow_internal_format = False
+torch.ops.load_library('./libascendc_pytorch.so')
+
+
+class TestAscendCOps(TestCase):
+
+    def test_add_custom_ops_float(self):
+        length = [8, 2048]
+        x = torch.rand(length, device='cpu', dtype=torch.float32)
+        y = torch.rand(length, device='cpu', dtype=torch.float32)
+        npuout = torch.ops.ascendc_ops.add(x.npu(), y.npu())
+        cpuout = torch.add(x, y)
+        self.assertRtolEqual(npuout, cpuout)
+
+    def test_add_custom_ops_int(self):
+        length = [8, 2048]
+        x = torch.randint(-10, 10, length, device='cpu', dtype=torch.int32)
+        y = torch.randint(-10, 10, length, device='cpu', dtype=torch.int32)
+        npuout = torch.ops.ascendc_ops.add(x.npu(), y.npu())
+        cpuout = torch.add(x, y)
+        self.assertRtolEqual(npuout, cpuout)
+
+if __name__ == '__main__':
+    run_tests()
\ No newline at end of file
diff --git a/atvc/examples/pytorch/reduce_sum/README.md b/atvc/examples/pytorch/reduce_sum/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e5115295f6cb02bfc8665dc82c9e0faa4172f104
--- /dev/null
+++ b/atvc/examples/pytorch/reduce_sum/README.md
@@ -0,0 +1,177 @@
+## 概述
+本样例基于ReduceSum算子,介绍了基于ATVC的单算子工程、单算子调用。
+
+## 目录结构介绍
+```
+├── reduce_sum
+│   ├── reduce_sum_impl.h                // 通过pytorch调用的方式调用ReduceSum算子
+│   ├── pytorch_ascendc_extension.cpp    // pytorch调用入口
+│   ├── run_op.py                        // pytorch的测试用例
+│   └── run.sh                           // 脚本,编译需要的二进制文件,并测试
+```
+
+## 算子描述
+ReduceSum算子实现了对输入数据沿指定维度累加求和的功能。对应的数学表达式为:
+```
+y = sum(x, dim)
+```
+## 算子规格描述
+
+| 算子类型(OpType) | ReduceSum | | | |
+| ---------------- | --------------- | -------- | --------- | ------ |
+| 算子输入 | name | shape | data type | format |
+| | x | 8 * 2048 | float,int | ND |
+| 算子输出 | y | 8 * 2048 | float,int | ND |
+| 核函数名 | ReduceSumCustom | | | |
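+
+上表中float、int两种数据类型各对应一份算子描述。下面是按规格表推测的OpTraits声明示意(仅为草图,模板实参以reduce_sum_impl.h中的实际定义为准):
+```cpp
+// 按规格表推测的算子描述声明(示意;以reduce_sum_impl.h为准)
+using ReduceOpTraitsFloat = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+using ReduceOpTraitsInt = ATVC::OpTraits<ATVC::OpInputs<int32_t>, ATVC::OpOutputs<int32_t>>;
+```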
+ +## 支持的产品型号 +本样例支持如下产品型号: +- Atlas A2训练系列产品 + +## 算子工程介绍 +其中,算子工程目录ReduceSum包含算子的实现文件,如下所示: +``` +├── reduce_sum +│ ├── reduce_sum_impl.h // 通过pytroch调用的方式调用ReduceSum算子 +│ ├── pytorch_ascendc_extension.cpp // pytorch调用入口 +│ ├── run_op.py // pytorch的测试用例 +│ └── run.sh // 脚本,编译需要的二进制文件,并测试 +``` + +run.sh脚本编译pytorch所需要的算子依赖,并将执行测试用例。 + +## 编译运行样例算子 +针对自定义算子工程,编译运行包含如下步骤: +- 完成算子pytorch入口和impl文件的实现; +- 编译pytorch算子的二进制文件; +- 调用执行pytorch算子; + +详细操作如下所示。 +### 1. 获取源码包及环境配置 +编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 +### 1. 安装pytorch环境 +参考[torch的安装](https://gitee.com/ascend/pytorch)进行安装torch、torch_npu环境 + +### 2. 基于ATVC编写pytorch算子的实现 + - 编写kernel侧函数,参考[reduce_sum_impl.h](./reduce_sum_impl.h) + ```cpp + // 引入头文件 + #include "reduce/reduce_host.h" + #include "reduce/reduce_device.h" + + // 首先通过ATVC提供的ATVC::OpTraits模板结构体来描述Add算子的输入输出信息,定义如下: + using ReduceOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; + using ReduceOpTraitsInt = ATVC::OpTraits, ATVC::OpOutputs>; + + /* + * 该函数为ReduceSumCustom算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第一个输出 + * reduceParam ATVC::ReduceParam + */ + template + __global__ __aicore__ void ReduceSumCustom(GM_ADDR x, GM_ADDR y, GM_ADDR z, ATVC::ReduceParam param) + { + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 + // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 + // ATVC::ReduceParam* 为tiling的类型 + auto op = ATVC::Kernel::ReduceOpTemplate, + Policy, + void, + void, + ATVC::ReduceParam*>(); + op.Run(x, y, &reduceParam); + } + ``` + - 编写pytorch入口函数,并调用kernel侧函数,参考[pytorch_ascendc_extension.cpp](./pytorch_ascendc_extension.cpp) + ```cpp + // pytorch 入口函数 + at::Tensor op_reduce_sum(const at::Tensor &x, const std::vector &dim) + { + std::vector shapeIn; + std::vector shapeOut; + ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + for (int32_t size : x.sizes()) { + shapeIn.push_back(size); + shapeOut.push_back(size); + } + for (const auto &i : dim) { + shapeOut[i] = 1; + } + auto options = torch::TensorOptions().dtype(x.scalar_type()).device(x.device()); + at::Tensor y = at::empty(shapeOut, options); + if (x.scalar_type() == at::kFloat) { + // Host侧调用Tiling API完成相关运行态参数的运算 + (void)ATVC::Host::CalcReduceTiling(shapeIn, dim, &policy, ¶m); + // 调用Adapter调度接口,完成核函数的模板调用 + AscendC::ReduceOpAdapter( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), param, policy); + } else if (x.scalar_type() == at::kInt) { + // int 类型调用 + ... + } + return y; + } + // 调用 + namespace AscendC { + inline namespace reduce { + template + // 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 + void ReduceOpAdapter(uint8_t *x, uint8_t *y, ATVC::ReduceParam param, ATVC::ReducePolicy &policy) + { + auto stream = c10_npu::getCurrentNPUStream().stream(false); + + // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 + if (policy == ATVC::REDUCE_POLICY0) { + ReduceSumCustom<<>>(x, y, param); + }else if { //其他policy逻辑 + ... 
+ } + } + } // namespace reduce + } // namespace AscendC + ``` + - 编写python调用函数,并调用pytorch入口函数,参考[run_op.py](./run_op.py) + + ```python + # 引入头文件 + import torch + import torch_npu + import numpy as np + from torch_npu.testing.testcase import TestCase, run_tests + # 加载二进制 + torch.npu.config.allow_internal_format = False + torch.ops.load_library('./libascendc_pytorch.so') + + class TestAscendCOps(TestCase): + # 测试用例 + def test_reduce_sum_ops_float(self): + length = [8, 2048] + x = torch.rand(length, device='cpu', dtype=torch.float32) + npuout = torch.ops.ascendc_ops.sum(x.npu(), (0,)) + cpuout = torch.sum(x, (0,)) + self.assertRtolEqual(npuout.reshape(cpuout.shape), cpuout) + + # 测试用例调用 + if __name__ == '__main__': + run_tests() + ``` + +### 3. 基于ATVC编写pytorch算子的调用验证 + - 调用脚本,生成pytorch算子,并运行测试用例 + ```bash + $ cd ./atvc/examples/pytorch/reduce_sum + $ bash run.sh + ... + OK + ``` + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ---------------------------- | +| 2025/07/24 | 新增readme | diff --git a/atvc/examples/pytorch/reduce_sum/pytorch_ascendc_extension.cpp b/atvc/examples/pytorch/reduce_sum/pytorch_ascendc_extension.cpp new file mode 100644 index 0000000000000000000000000000000000000000..86c275e47b77584abefa7ba62085dc3b472ab165 --- /dev/null +++ b/atvc/examples/pytorch/reduce_sum/pytorch_ascendc_extension.cpp @@ -0,0 +1,112 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include +#include "torch_npu/csrc/core/npu/NPUStream.h" +#include "reduce_sum_impl.h" + +namespace AscendC { +inline namespace reduce { +// 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 +template +void ReduceOpAdapter(uint8_t *x, uint8_t *y, ATVC::ReduceParam param, ATVC::ReducePolicy &policy) +{ + auto stream = c10_npu::getCurrentNPUStream().stream(false); + + // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 + if (policy == ATVC::REDUCE_POLICY0) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY1) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY2) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY3) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY4) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY5) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY6) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY7) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY8) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY9) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY10) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY11) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY12) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY13) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY14) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY15) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY16) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY17) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY18) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY19) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY20) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY21) { + ReduceSumCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY22) { + ReduceSumCustom<<>>(x, y, param); + } +} +} // namespace reduce +} // namespace AscendC + +namespace ascendc_reduce_ops { +at::Tensor op_reduce_sum(const at::Tensor &x, const std::vector &dim) +{ + std::vector shapeIn; + std::vector shapeOut; + ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + for (int32_t size : x.sizes()) { + shapeIn.push_back(size); + shapeOut.push_back(size); + } + for (const auto &i : dim) { + shapeOut[i] = 1; + } + auto options = torch::TensorOptions().dtype(x.scalar_type()).device(x.device()); + at::Tensor y = at::empty(shapeOut, options); + if (x.scalar_type() == at::kFloat) { + // Host侧调用Tiling API完成相关运行态参数的运算 + (void)ATVC::Host::CalcReduceTiling(shapeIn, dim, &policy, ¶m); + // 调用Adapter调度接口,完成核函数的模板调用 + AscendC::ReduceOpAdapter( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), param, policy); + } else if (x.scalar_type() == at::kInt) { + // Host侧调用Tiling API完成相关运行态参数的运算 + (void)ATVC::Host::CalcReduceTiling(shapeIn, dim, &policy, ¶m); + // 调用Adapter调度接口,完成核函数的模板调用 + AscendC::ReduceOpAdapter( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), param, policy); + } + return y; +} + 
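+// 通过TORCH_LIBRARY将op_reduce_sum注册为torch自定义算子,python侧经torch.ops.ascendc_ops.sum调用(见run_op.py)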
+TORCH_LIBRARY(ascendc_ops, m) +{ + m.def("sum", &ascendc_reduce_ops::op_reduce_sum); +} +} // namespace ascendc_reduce_ops \ No newline at end of file diff --git a/atvc/examples/pytorch/reduce_sum/reduce_sum_impl.h b/atvc/examples/pytorch/reduce_sum/reduce_sum_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..670215c7ba865e50c966ae76ee1a26364138d0a4 --- /dev/null +++ b/atvc/examples/pytorch/reduce_sum/reduce_sum_impl.h @@ -0,0 +1,39 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef REDUCE_SUM_CUSTOM_IMPL_H +#define REDUCE_SUM_CUSTOM_IMPL_H +#include "reduce/reduce_host.h" +#include "reduce/reduce_device.h" + +using ReduceOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; +using ReduceOpTraitsInt = ATVC::OpTraits, ATVC::OpOutputs>; + +/* + * 该函数为ReduceSumCustom算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第一个输出 + * reduceParam ATVC::ReduceParam + */ +template +__global__ __aicore__ void ReduceSumCustom(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam reduceParam) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 + // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 + auto op = ATVC::Kernel::ReduceOpTemplate, + Policy, + void, + void, + ATVC::ReduceParam*>(); + op.Run(x, y, &reduceParam); +} + +#endif diff --git a/atvc/examples/pytorch/reduce_sum/run.sh b/atvc/examples/pytorch/reduce_sum/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..2650c7dc98f5ea7b22a49b1a57a1c9d9b64a5d0c --- /dev/null +++ b/atvc/examples/pytorch/reduce_sum/run.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +torch_location=$(python3 -c "import torch; print(torch.__path__[0])") +torch_npu_location=$(python3 -c "import torch_npu; print(torch_npu.__path__[0])") +python_include=$(python3 -c "import sysconfig; print(sysconfig.get_path('include'))") +python_lib=$(python3 -c "import sysconfig; print(sysconfig.get_path('stdlib'))") +export LD_LIBRARY_PATH=${torch_npu_location}/lib/:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=${torch_location}/lib/:$LD_LIBRARY_PATH +if [ -z "$ATVC_PATH" ]; then + atvc_path=$(realpath ../../../include) +else + atvc_path=$ATVC_PATH +fi + +rm -rf *.json +rm -rf libascendc_pytorch.so + + +bishengcc pytorch_ascendc_extension.cpp \ + -arch Ascend910B1 \ + -I${torch_location}/include \ + -I${torch_location}/include/torch/csrc/api/include \ + -I${python_include} \ + -I${atvc_path} \ + -I${torch_npu_location}/include \ + -L${torch_location}/lib \ + -L${torch_npu_location}/lib \ + -L${python_lib} \ + -ltorch -ltorch_cpu -lc10 -ltorch_npu -lpython3 -ltorch_python \ + -o libascendc_pytorch.so \ + -shared + +python3 run_op.py + +if [ $? -ne 0 ]; then + echo "ERROR: verify result failed! the result is wrong!" + return 1 +fi + +rm -rf *.json +rm -rf libascendc_pytorch.so \ No newline at end of file diff --git a/atvc/examples/pytorch/reduce_sum/run_op.py b/atvc/examples/pytorch/reduce_sum/run_op.py new file mode 100644 index 0000000000000000000000000000000000000000..c433a3abeb8786f913116d6d64c3dc91755a8ff4 --- /dev/null +++ b/atvc/examples/pytorch/reduce_sum/run_op.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. +# +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
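+# 本用例加载run.sh编译生成的libascendc_pytorch.so,将torch.ops.ascendc_ops.sum与torch.sum的计算结果进行比对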
+ +import torch +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests + +torch.npu.config.allow_internal_format = False +torch.ops.load_library('./libascendc_pytorch.so') + + +class TestAscendCOps(TestCase): + + def test_reduce_sum_ops_float(self): + length = [8, 2048] + x = torch.rand(length, device='cpu', dtype=torch.float32) + npuout = torch.ops.ascendc_ops.sum(x.npu(), (0,)) + cpuout = torch.sum(x, (0,)) + self.assertRtolEqual(npuout.reshape(cpuout.shape), cpuout) + + def test_reduce_sum_ops_int(self): + length = [8, 2048] + x = torch.randint(-10, 10, length, device='cpu', dtype=torch.int32) + npuout = torch.ops.ascendc_ops.sum(x.npu(), (0,)) + cpuout = torch.sum(x, (0,), dtype=torch.int32) + self.assertRtolEqual(npuout.reshape(cpuout.shape), cpuout) + +if __name__ == '__main__': + run_tests() \ No newline at end of file diff --git a/atvc/examples/reduce_sum/reduce_sum.cpp b/atvc/examples/reduce_sum/reduce_sum.cpp index 6f19ab4964668cc9cd63744c46c2541b469fe20c..43b393022690ea2be7ec24ec5361963bc569c8b3 100644 --- a/atvc/examples/reduce_sum/reduce_sum.cpp +++ b/atvc/examples/reduce_sum/reduce_sum.cpp @@ -16,7 +16,8 @@ #include #include #include "acl/acl.h" -#include "atvc.h" +#include "reduce/reduce_host.h" +#include "reduce/reduce_device.h" #define CHECK_ACL(x) \ do { \ @@ -37,11 +38,26 @@ bool IsClose(float a, float b) float diff = std::abs(a - b); return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); } -} // ReduceSum算子的描述:一个输入,一个输出,类型均为float using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; +bool VerifyResults(const std::vector &golden, const std::vector &output) +{ + for (int32_t i = 0; i < golden.size(); i++) { + if (!IsClose(golden[i], output[i])) { + printf("Accuracy verification failed! The expected value of element " + "in index [%d] is %f, but actual value is %f.\n", + i, + golden[i], + output[i]); + return false; + } + } + return true; +} +} + /* * 该函数为ReduceCustom算子核函数入口 * x Device上的gm地址,指向Add算子第一个输入 @@ -155,7 +171,7 @@ int32_t main(int32_t argc, char* argv[]) CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 - ATVC::ReducePolicy policy; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 // Host侧调用Tiling API完成相关运行态参数的运算 if (!ATVC::Host::CalcReduceTiling(shape, dim, &policy, ¶m)) { printf("Reduce tiling error.\n"); @@ -178,14 +194,10 @@ int32_t main(int32_t argc, char* argv[]) CHECK_ACL(aclrtResetDevice(deviceId)); CHECK_ACL(aclFinalize()); - for (int32_t i = 0; i < outEleNum; i++) { - if (!IsClose(golden[i], outputY[i])) { - printf("Accuracy verification failed! 
The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, golden[i], outputY[i]); - return -1; - } + if (!VerifyResults(golden, outputY)) { + return -1; } + printf("Accuracy verification passed.\n"); return 0; } diff --git a/atvc/examples/sinh_custom/sinh_custom.cpp b/atvc/examples/sinh_custom/sinh_custom.cpp index b339ccbbcc8e0d741ea84d195274fc8918521277..e8e277b76e7117137579b4ab6880453d182c1e7d 100644 --- a/atvc/examples/sinh_custom/sinh_custom.cpp +++ b/atvc/examples/sinh_custom/sinh_custom.cpp @@ -16,7 +16,8 @@ #include #include #include "acl/acl.h" -#include "atvc.h" +#include "elewise/elewise_host.h" +#include "elewise/elewise_device.h" #define CHECK_ACL(x) \ do { \ @@ -37,7 +38,6 @@ bool IsClose(float a, float b) float diff = std::abs(a - b); return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); } -} // 描述算子的输入输出以及临时计算资源 using SinhOpTraits = ATVC::OpTraits, ATVC::OpOutputs, ATVC::OpTemps>; @@ -64,6 +64,48 @@ struct SinhComputeFunc { } }; +void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &golden) +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(1.0f, 10.0f); + + for (int i = 0; i < eleNum; ++i) { + inputX[i] = dis(gen); + golden[i] = std::sinh(inputX[i]); + } +} + +bool VerifyResults(const std::vector &golden, const std::vector &output) +{ + for (int32_t i = 0; i < golden.size(); i++) { + if (!IsClose(golden[i], output[i])) { + printf("Accuracy verification failed! The expected value of element " + "in index [%d] is %f, but actual value is %f.\n", + i, + golden[i], + output[i]); + return false; + } + } + return true; +} + +void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void CleanACL(aclrtStream &stream, int32_t deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} +} /* * 该函数为SinhCustom算子核函数入口 @@ -85,19 +127,11 @@ int main() int32_t eleNum = 8 * 2048; size_t inputByteSize = static_cast(eleNum) * sizeof(float); size_t outputByteSize = static_cast(eleNum) * sizeof(float); - - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dis(1.0f, 10.0f); std::vector inputX(eleNum); std::vector golden(eleNum); + InitializeData(eleNum, inputX, golden); - for (int i = 0; i < eleNum; ++i) { - inputX[i] = (dis(gen)); - golden[i] = std::sinh(inputX[i]); - } - printf("Generate golden data successfully.\n"); ATVC::EleWiseParam param; // 计算输入为8*2048个float元素的sinh算子的运行态参数param @@ -106,19 +140,15 @@ int main() return -1; }; // 初始化Acl资源与数据 - CHECK_ACL(aclInit(nullptr)); aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); + int32_t deviceId = 0; + InitializeACL(context, stream, deviceId); uint8_t *yHost; uint8_t *xDevice; uint8_t *yDevice; uint8_t *paramDevice; - CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); @@ -145,17 +175,10 @@ int main() CHECK_ACL(aclrtFree(paramDevice)); CHECK_ACL(aclrtFreeHost(yHost)); - CHECK_ACL(aclrtDestroyStream(stream)); - 
CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); + CleanACL(stream, deviceId); - for (int32_t i = 0; i < eleNum; i++) { - if (!IsClose(golden[i], outputY[i])) { - printf("Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, golden[i], outputY[i]); - return -1; - } + if (!VerifyResults(golden, outputY)) { + return -1; } printf("Accuracy verification passed.\n"); return 0; diff --git a/atvc/include/broadcast/broadcast_device.h b/atvc/include/broadcast/broadcast_device.h new file mode 100644 index 0000000000000000000000000000000000000000..7d1f7775f4b696098b159f7321c4f6fa3fd4cede --- /dev/null +++ b/atvc/include/broadcast/broadcast_device.h @@ -0,0 +1,30 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + + +/*! + * \file broadcast_device.h + * \brief + */ + +#ifndef ATVC_BROADCAST_DEVICE_H +#define ATVC_BROADCAST_DEVICE_H + +#include "common/atvc_opdef.h" +#include "common/const_def.h" +#include "common/kernel_utils.h" +#include "common/ops_utils_device.h" +#include "broadcast/common/broadcast_common.h" +#include "broadcast/broadcast_compute.h" +#include "broadcast/broadcast_op_template.h" +#include "kernel_operator.h" + +#endif // ATVC_BROADCAST_DEVICE_H \ No newline at end of file diff --git a/atvc/include/broadcast/broadcast_op_template.h b/atvc/include/broadcast/broadcast_op_template.h index f9d777f5199fd1677c95affa8db46631af470d38..99cdc7d9834829374883145bc66b71be552f6e2b 100644 --- a/atvc/include/broadcast/broadcast_op_template.h +++ b/atvc/include/broadcast/broadcast_op_template.h @@ -13,7 +13,9 @@ #ifndef ATVC_BROADCAST_OP_TEMPLATE_H #define ATVC_BROADCAST_OP_TEMPLATE_H #include "common/const_def.h" +#include "common/ops_utils_device.h" #include "broadcast/broadcast_utils/broadcast_buf_pool.h" +#include "broadcast/broadcast_compute.h" namespace ATVC { struct BroadcastDataView { uint32_t dimASize; @@ -30,7 +32,7 @@ struct BroadcastDataView { }; namespace Kernel { -template +template class BroadcastOpTemplate { public: using DataType = typename BroadcastCompute::DataType; @@ -41,26 +43,27 @@ public: @param dst: 输出数据的gm指针 @broadcastParam: broadcast的动态参数,包含tiling data, workspace等 */ - __aicore__ inline void Run(GM_ADDR src, GM_ADDR dst, GM_ADDR broadcastParam) + template + __aicore__ inline void Run(GM_ADDR src, GM_ADDR dst, T1 broadcastParam) { this->Init(src, dst, broadcastParam); this->Process(); } private: - __aicore__ inline void Init(GM_ADDR src, GM_ADDR dst, GM_ADDR broadcastParam) + template + __aicore__ inline void Init(GM_ADDR src, GM_ADDR dst, T1 broadcastParam) { - param_ = reinterpret_cast<__gm__ BroadcastParam*>(broadcastParam); - tilingData_ = ¶m_->tilingData; - uint32_t srcDataSize = tilingData_->basicBlock; - uint32_t dstDataSize = tilingData_->basicBlock; + param_ = reinterpret_cast(broadcastParam); + uint32_t srcDataSize = this->param_->tilingData.basicBlock; + uint32_t dstDataSize = 
this->param_->tilingData.basicBlock; srcGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ DataType*>(src), srcDataSize); dstGlobal_.SetGlobalBuffer(reinterpret_cast<__gm__ DataType*>(dst), dstDataSize); bufPool_.template Init(GetTPipePtr(), ATVC::CONST2, // doublebuff需要的输入个数 ATVC::CONST2, // 计算结果的个数,一般与inputNum保持一致 - tilingData_->A2 * tilingData_->A12 * DATA_SIZE, // 输入Tensor大小 - tilingData_->A2 * tilingData_->B2 * DATA_SIZE); // 输出Tensor大小 + this->param_->tilingData.A2 * this->param_->tilingData.A12 * DATA_SIZE, // 输入Tensor大小 + this->param_->tilingData.A2 * this->param_->tilingData.B2 * DATA_SIZE); // 输出Tensor大小 } __aicore__ inline void CopyOutBatch(BroadcastDataView &view, @@ -72,12 +75,12 @@ private: for (int i = 0; i < view.B1; i++) { uint32_t copyOutOffset; if (SelectBroadcastPolicy.patternID == AB_PATTERN::ABA) { - copyOutOffset = dimBCount * view.dimASize + dimACount * tilingData_->A2; + copyOutOffset = dimBCount * view.dimASize + dimACount * this->param_->tilingData.A2; } else { - copyOutOffset = dimACount * tilingData_->A2 * view.dimBSize + dimBCount; + copyOutOffset = dimACount * this->param_->tilingData.A2 * view.dimBSize + dimBCount; } CopyOut(output, copyOutOffset + view.copyOutBaseOffset, view); - dimBCount += tilingData_->B2; + dimBCount += this->param_->tilingData.B2; AscendC::PipeBarrier(); } } @@ -92,8 +95,8 @@ private: for (int i = 0; i < view.A11; i++) { inputOffset = 0; bufPool_.AllocTensor(input); - uint32_t copyInOffset = i * view.A12 * tilingData_->A2; - if (tilingData_->A0 != 1) { + uint32_t copyInOffset = i * view.A12 * this->param_->tilingData.A2; + if (this->param_->tilingData.A0 != 1) { copyInOffset += view.dimAOffset; } if (copyInOffset >= view.dimASize) { @@ -102,7 +105,7 @@ private: if (copyInOffset + view.copyInSize > view.dimASize) { // 剩下的数据不够一次完整计算, 根据实际数据重新计算 view.copyInSize = view.dimASize - copyInOffset; - view.A12 = OpsUtils::CeilDiv(view.copyInSize, tilingData_->A2); + view.A12 = OpsUtils::CeilDiv(view.copyInSize, this->param_->tilingData.A2); } CopyIn(input, copyInOffset, view); bufPool_.SetVecSync(input); @@ -112,14 +115,14 @@ private: bufPool_.AllocTensor(output); SyncDataQueue(); compute_.template Compute(input, inputOffset, output, - OpsUtils::CeilAlign(tilingData_->A2, UB_ALIGN_COUNT), - OpsUtils::CeilAlign(tilingData_->B2, UB_ALIGN_COUNT)); + OpsUtils::CeilAlign(this->param_->tilingData.A2, UB_ALIGN_COUNT), + OpsUtils::CeilAlign(this->param_->tilingData.B2, UB_ALIGN_COUNT)); bufPool_.SetCopyOutSync(output); bufPool_.WaitCopyOutSync(output); CopyOutBatch(view, dimACount, output); bufPool_.FreeTensor(output); dimACount++; - inputOffset += tilingData_->A2; + inputOffset += this->param_->tilingData.A2; SyncDataQueue(); } SyncDataQueue(); @@ -133,17 +136,17 @@ private: uint32_t copyOutBaseOffset = 0; // 计算拷出偏移基址 if (SelectBroadcastPolicy.patternID == AB_PATTERN::ABA) { - if (tilingData_->A0 != 1) { // 核间A切分, 取部分A + if (this->param_->tilingData.A0 != 1) { // 核间A切分, 取部分A copyOutBaseOffset += view.dimAOffset; } - if (tilingData_->B0 != 1) { // 核间B切分,取部分B + if (this->param_->tilingData.B0 != 1) { // 核间B切分,取部分B copyOutBaseOffset += view.dimBOffset * view.dimASize; } } else { - if (tilingData_->A0 != 1) { // 核间A切分, 取部分A + if (this->param_->tilingData.A0 != 1) { // 核间A切分, 取部分A copyOutBaseOffset += view.dimAOffset * view.dimBSize; } - if (tilingData_->B0 != 1) { // 核间B切分,取部分B + if (this->param_->tilingData.B0 != 1) { // 核间B切分,取部分B copyOutBaseOffset += view.dimBOffset; } } @@ -153,34 +156,34 @@ private: __aicore__ inline void CalcView(BroadcastDataView 
&view) { if (SelectBroadcastPolicy.patternID == AB_PATTERN::ABA) { - view.dimASize = tilingData_->dstShape[1]; - view.dimBSize = tilingData_->dstShape[0]; + view.dimASize = this->param_->tilingData.dstShape[1]; + view.dimBSize = this->param_->tilingData.dstShape[0]; view.inShape[0] = 1; - view.inShape[1] = tilingData_->A2; - view.outShape[0] = tilingData_->B2; - view.outShape[1] = tilingData_->A2; + view.inShape[1] = this->param_->tilingData.A2; + view.outShape[0] = this->param_->tilingData.B2; + view.outShape[1] = this->param_->tilingData.A2; } else { - view.dimASize = tilingData_->dstShape[0]; - view.dimBSize = tilingData_->dstShape[1]; - view.inShape[0] = tilingData_->A2; + view.dimASize = this->param_->tilingData.dstShape[0]; + view.dimBSize = this->param_->tilingData.dstShape[1]; + view.inShape[0] = this->param_->tilingData.A2; view.inShape[1] = 1; - view.outShape[0] = tilingData_->A2; - view.outShape[1] = tilingData_->B2; + view.outShape[0] = this->param_->tilingData.A2; + view.outShape[1] = this->param_->tilingData.B2; } - view.A11 = tilingData_->A11; - view.A12 = tilingData_->A12; - view.B1 = tilingData_->B1; + view.A11 = this->param_->tilingData.A11; + view.A12 = this->param_->tilingData.A12; + view.B1 = this->param_->tilingData.B1; uint32_t blockId = AscendC::GetBlockIdx(); - uint32_t dimAIdx = blockId / tilingData_->B0; - uint32_t dimBIdx = blockId % tilingData_->factorBTotalCnt; - view.dimAOffset = dimAIdx * tilingData_->factorACntPerCore; - view.dimBOffset = dimBIdx * tilingData_->factorBCntPerCore; + uint32_t dimAIdx = blockId / this->param_->tilingData.B0; + uint32_t dimBIdx = blockId % this->param_->tilingData.factorBTotalCnt; + view.dimAOffset = dimAIdx * this->param_->tilingData.factorACntPerCore; + view.dimBOffset = dimBIdx * this->param_->tilingData.factorBCntPerCore; // 计算一次计算的输入数据大小 - view.copyInSize = view.A12 * tilingData_->A2; // 一次拷贝A12份数据, for循环计算A12次 - if (view.dimAOffset + tilingData_->factorACntPerCore > view.dimASize) { + view.copyInSize = view.A12 * this->param_->tilingData.A2; // 一次拷贝A12份数据, for循环计算A12次 + if (view.dimAOffset + this->param_->tilingData.factorACntPerCore > view.dimASize) { // 剩下的A维度的数据不够每个核分到的A数目,重新计算实际的A维度切分 uint32_t realShape = view.dimASize - view.dimAOffset; - uint32_t A1 = OpsUtils::CeilDiv(realShape, tilingData_->A2); + uint32_t A1 = OpsUtils::CeilDiv(realShape, this->param_->tilingData.A2); if (A1 < view.A12) { view.A11 = 1; view.A12 = A1; @@ -188,9 +191,9 @@ private: view.A11 = OpsUtils::CeilDiv(A1, view.A12); } } - if (view.dimBOffset + tilingData_->factorBCntPerCore > view.dimBSize) { + if (view.dimBOffset + this->param_->tilingData.factorBCntPerCore > view.dimBSize) { uint32_t realShape = view.dimBSize - view.dimBOffset; - view.B1 = OpsUtils::CeilDiv(realShape, tilingData_->B2); + view.B1 = OpsUtils::CeilDiv(realShape, this->param_->tilingData.B2); } view.copyOutBaseOffset = CalcCopyOutBaseOffset(view); } @@ -211,7 +214,7 @@ private: { uint32_t blockId = AscendC::GetBlockIdx(); uint32_t dstDataSize = view.outShape[0] * view.outShape[1]; - uint64_t dstShape = tilingData_->dstShape[1]; + uint64_t dstShape = this->param_->tilingData.dstShape[1]; AscendC::DataCopyExtParams copyOutParams; copyOutParams.blockLen = view.outShape[1] * DATA_SIZE; copyOutParams.blockCount = dstDataSize * DATA_SIZE / copyOutParams.blockLen; @@ -222,9 +225,9 @@ private: copyOutParams.blockLen = (dstShape - copyOutOffset % dstShape) * DATA_SIZE; copyOutParams.srcStride = (copyOutParams.srcStride - copyOutParams.blockLen) / ATVC::UB_ALIGN_32; } - if 
(view.outShape[0] + copyOutOffset / dstShape > tilingData_->dstShape[0]) { + if (view.outShape[0] + copyOutOffset / dstShape > this->param_->tilingData.dstShape[0]) { // 行非对齐, 按实际数据拷贝 - copyOutParams.blockCount = (tilingData_->dstShape[0] - copyOutOffset / dstShape); + copyOutParams.blockCount = (this->param_->tilingData.dstShape[0] - copyOutOffset / dstShape); } copyOutParams.dstStride = dstShape * DATA_SIZE - copyOutParams.blockLen; AscendC::DataCopyPad(dstGlobal_[copyOutOffset], output, copyOutParams); @@ -238,8 +241,7 @@ private: AscendC::GlobalTensor srcGlobal_; AscendC::GlobalTensor dstGlobal_; BroadcastCompute compute_; - const __gm__ BroadcastParam *param_; - const __gm__ BroadcastOpTilingData *tilingData_; + ParamPtr param_; KernelUtils::BroadcastBufPool bufPool_; constexpr static uint32_t DATA_SIZE = sizeof(DataType); constexpr static uint32_t UB_ALIGN_COUNT = ATVC::UB_ALIGN_32 / DATA_SIZE; diff --git a/atvc/include/broadcast/common/broadcast_common.h b/atvc/include/broadcast/common/broadcast_common.h index 3f51b4e808a3188cedd287fc36df9d6a9b8e3d9e..fcdd0c0ff194596374ea5f58ba196f364fba1105 100644 --- a/atvc/include/broadcast/common/broadcast_common.h +++ b/atvc/include/broadcast/common/broadcast_common.h @@ -32,6 +32,8 @@ public: int32_t patternID = -1; int32_t loopABCount = -1; int32_t loopInnerABCount = -1; + constexpr BroadcastPolicy(int patternID_, int loopABCount_, int loopInnerABCount_): + patternID(patternID_), loopABCount(loopABCount_),loopInnerABCount(loopInnerABCount_){} bool operator==(const BroadcastPolicy& rhs) const { return this->patternID == rhs.patternID && this->loopABCount == rhs.loopABCount &&\ diff --git a/atvc/include/broadcast/tiling/broadcast_tiling.h b/atvc/include/broadcast/tiling/broadcast_tiling.h index a054aaa3265a0b2958bde14c0f81ee5b3d138530..10dfc8fe7f8b333923e02407904fc145e099bc91 100644 --- a/atvc/include/broadcast/tiling/broadcast_tiling.h +++ b/atvc/include/broadcast/tiling/broadcast_tiling.h @@ -24,7 +24,7 @@ #include "graph/types.h" #include "common/const_def.h" #include "common/compile_info.h" -#include "common/ops_utils.h" +#include "common/ops_utils_host.h" namespace ATVC { @@ -32,6 +32,8 @@ struct BroadcastTilingInputParam { std::vector shapeIn; std::vector shapeOut; ge::DataType inputDtype = ge::DataType::DT_UNDEFINED; + BroadcastTilingInputParam(std::vector in,std::vector out,ge::DataType inDtype): + shapeIn(in),shapeOut(out),inputDtype(inDtype){} }; } @@ -150,7 +152,7 @@ private: if (oriShapeIn[i] == 1 && oriShapeOut[i] != oriShapeIn[i]) { // B轴 if (!isCurB && haveB) { printf("[ERROR]Only support AB/BA!\n"); - return false; + return false; } if (!haveB) { shapeIn.emplace_back(oriShapeIn[i]); @@ -161,10 +163,10 @@ private: } isCurB = true; haveB = true; - }else { // A轴 + } else { // A轴 if (isCurB && haveA) { printf("[ERROR]Only support AB/BA!\n"); - return false; + return false; } if (!haveA) { shapeIn.emplace_back(oriShapeIn[i]); @@ -351,7 +353,7 @@ private: ATVC::BroadcastTilingInputParam opInput_; ATVC::BroadcastParam* param_ {nullptr}; ATVC::BroadcastPolicy* policy_ {nullptr}; - ATVC::OpCompileInfo compileInfo_; + ATVC::OpCompileInfo compileInfo_ = {0, 0, 0, 0}; }; } // namespace OpTiling #endif // ATVC_BROADCAST_TILING_H \ No newline at end of file diff --git a/atvc/include/common/compile_info.h b/atvc/include/common/compile_info.h index dc5479a76fd92c8f306e8c95cd1230d770d48ff8..f92f68aa479d8b89ed24cbc78bc0f026c8cba9a7 100644 --- a/atvc/include/common/compile_info.h +++ b/atvc/include/common/compile_info.h @@ -20,6 +20,12 @@ struct 
OpCompileInfo {
     uint64_t ubSize = 0;
     uint64_t cacheLineSize = 0;
     uint64_t ubBlockSize = 0;
+    OpCompileInfo(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+        this->vectorCoreNum = a;
+        this->ubBlockSize = d;
+        this->ubSize = b;
+        this->cacheLineSize = c;
+    }
 };
 
 inline OpCompileInfo GetOpCompileInfo()
diff --git a/atvc/include/common/const_def.h b/atvc/include/common/const_def.h
index 4ae60fa0d435c34e82d8891cf6798056e9b6a130..df6eb2fbd1bfbc9cdda64347dd6f649f784bcee8 100644
--- a/atvc/include/common/const_def.h
+++ b/atvc/include/common/const_def.h
@@ -13,6 +13,7 @@
 #ifndef ATVC_COMMON_CONST_DEF_H
 #define ATVC_COMMON_CONST_DEF_H
+#include <cstdint>
 namespace ATVC {
 constexpr int32_t UB_ALIGN_32 = 32;
 constexpr int32_t UB_ALIGN_31 = 31;
diff --git a/atvc/include/common/ops_utils_device.h b/atvc/include/common/ops_utils_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..cacac400336b39340c30b91fca507f19c282500f
--- /dev/null
+++ b/atvc/include/common/ops_utils_device.h
@@ -0,0 +1,77 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+/*!
+ * \file ops_utils_device.h
+ * \brief
+ */
+
+#ifndef ATVC_COMMON_OPS_UTILS_DEVICE_H
+#define ATVC_COMMON_OPS_UTILS_DEVICE_H
+
+#include "kernel_operator.h"
+namespace OpsUtils {
+template <typename T>
+__aicore__ inline T Ceil(T a, T b)
+{
+    if (b == 0) {
+        return a;
+    }
+    return (a + b - 1) / b;
+}
+
+template <typename T>
+__aicore__ inline T CeilAlign(T a, T b)
+{
+    if (b == 0) {
+        return a;
+    }
+    return (a + b - 1) / b * b;
+}
+
+template <typename T>
+__aicore__ inline T CeilDiv(T a, T b)
+{
+    if (b == 0) {
+        return a;
+    }
+    return (a + b - 1) / b;
+}
+
+template <typename T, typename U>
+__aicore__ inline T FloorDiv(T a, U b)
+{
+    if (b == 0) {
+        return a;
+    }
+    return a / b;
+}
+
+template <typename T>
+__aicore__ inline T Aligned(T value, T alignment)
+{
+    if (alignment == 0) {
+        return value;
+    }
+    return (value + alignment - 1) / alignment * alignment;
+}
+
+/**
+ * if align is 0, return 0
+ */
+template <typename T, typename U>
+__aicore__ inline typename std::enable_if ::value, T>::type FloorAlign(T x, U align) {
+    return align == 0 ? 0 : x / align * align;
+}
+
+}
+
+#endif // ATVC_COMMON_OPS_UTILS_DEVICE_H
\ No newline at end of file
diff --git a/atvc/include/common/ops_utils.h b/atvc/include/common/ops_utils_host.h
similarity index 71%
rename from atvc/include/common/ops_utils.h
rename to atvc/include/common/ops_utils_host.h
index efb33063196c3d9d098a42b8abd82e62dc103b9a..e92ff007f5f3a5b3c6d845df9a46f5c3b11885c3 100644
--- a/atvc/include/common/ops_utils.h
+++ b/atvc/include/common/ops_utils_host.h
@@ -10,29 +10,34 @@
  */
 
 /*!
- * \file ops_utils.h
+ * \file ops_utils_host.h
  * \brief
  */
 
-#ifndef ATVC_COMMON_OPS_UTILS_H
-#define ATVC_COMMON_OPS_UTILS_H
+#ifndef ATVC_COMMON_OPS_UTILS_HOST_H
+#define ATVC_COMMON_OPS_UTILS_HOST_H
 
-#include "kernel_operator.h"
 namespace OpsUtils {
 template <typename T>
-__host_aicore__ inline T Ceil(T a, T b)
+inline T Ceil(T a, T b)
 {
+    if (b == 0) {
+        return a;
+    }
     return (a + b - 1) / b;
 }
 
 template <typename T>
-__host_aicore__ inline T CeilAlign(T a, T b)
+inline T CeilAlign(T a, T b)
 {
+    if (b == 0) {
+        return a;
+    }
     return (a + b - 1) / b * b;
 }
 
 template <typename T>
-__host_aicore__ inline T CeilDiv(T a, T b)
+inline T CeilDiv(T a, T b)
 {
     if (b == 0) {
         return a;
@@ -41,7 +46,7 @@ __host_aicore__ inline T CeilDiv(T a, T b)
 }
 
 template <typename T, typename U>
-__host_aicore__ inline T FloorDiv(T a, U b)
+inline T FloorDiv(T a, U b)
 {
     if (b == 0) {
         return a;
@@ -50,7 +55,7 @@ __host_aicore__ inline T FloorDiv(T a, U b)
 }
 
 template <typename T>
-__host_aicore__ inline T Aligned(T value, T alignment)
+inline T Aligned(T value, T alignment)
 {
     if (alignment == 0) {
         return value;
@@ -62,10 +67,10 @@ __host_aicore__ inline T Aligned(T value, T alignment)
  * if align is 0, return 0
  */
 template <typename T, typename U>
-__host_aicore__ inline typename std::enable_if ::value, T>::type FloorAlign(T x, U align) {
+inline typename std::enable_if ::value, T>::type FloorAlign(T x, U align) {
    return align == 0 ? 0 : x / align * align;
 }
 
 }
 
-#endif // ATVC_COMMON_OPS_UTILS_H
\ No newline at end of file
+#endif // ATVC_COMMON_OPS_UTILS_HOST_H
\ No newline at end of file
diff --git a/atvc/include/elewise/elewise_device.h b/atvc/include/elewise/elewise_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..de15983a34b1e3dcb4d4bdd9044c11a4fd45c6a7
--- /dev/null
+++ b/atvc/include/elewise/elewise_device.h
@@ -0,0 +1,28 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+
+/*!
+ * \file elewise_device.h + * \brief + */ + +#ifndef ATVC_ELEWISE_DEVICE_H +#define ATVC_ELEWISE_DEVICE_H + +#include "common/atvc_opdef.h" +#include "common/const_def.h" +#include "common/kernel_utils.h" +#include "elewise/common/elewise_common.h" +#include "elewise/elewise_op_template.h" +#include "kernel_operator.h" + +#endif // ATVC_ELEWISE_DEVICE_H \ No newline at end of file diff --git a/atvc/include/elewise/elewise_host.h b/atvc/include/elewise/elewise_host.h index cffb7f99ea4a7dd4f4fd9b04bea721ed6359c67b..3c64aaa90cc3fb2b25fdacf30254c2b4c69956ce 100644 --- a/atvc/include/elewise/elewise_host.h +++ b/atvc/include/elewise/elewise_host.h @@ -22,6 +22,7 @@ #include "common/compile_info.h" #include "common/atvc_opdef.h" #include "elewise/common/elewise_common.h" + namespace ATVC { namespace Host { namespace { diff --git a/atvc/include/elewise/elewise_op_template.h b/atvc/include/elewise/elewise_op_template.h index ff1d05604181a42f1c725c740e84fd9f0eafeec9..2646597061718f5177ba87005b0dcfa40daf9dc4 100644 --- a/atvc/include/elewise/elewise_op_template.h +++ b/atvc/include/elewise/elewise_op_template.h @@ -23,7 +23,7 @@ namespace ATVC { namespace Kernel { -template +template class EleWiseOpTemplate { using EleWiseOpTraits = typename GetFunctionTraits::ComputeTraits; using Inputs = typename EleWiseOpTraits::In::types; @@ -64,7 +64,7 @@ private: FillOffsets(outOffsets_); FillOffsets(tempOffsets_); - this->param_ = reinterpret_cast<__gm__ ATVC::EleWiseParam*>(t0); + this->param_ = reinterpret_cast(t0); uint32_t curBlockId = AscendC::GetBlockIdx(); if (curBlockId < param_->tilingData.tailBlockCnt) { @@ -328,7 +328,7 @@ private: std::size_t tempOffsets_[TempCount]; // 计算得到的tiling数据 - __gm__ EleWiseParam* param_; + ParamPtr param_; uint32_t curCoreCnt_; uint32_t curCoreStartCnt_; diff --git a/atvc/include/reduce/common/reduce_common.h b/atvc/include/reduce/common/reduce_common.h index fca80f3fe2d28dc5a2dbdf9fc8877f3610c19758..309cc49ea42b4b1220a4d9cf05fa0c27a66aa7f8 100644 --- a/atvc/include/reduce/common/reduce_common.h +++ b/atvc/include/reduce/common/reduce_common.h @@ -76,16 +76,27 @@ namespace AR_COUNT { static constexpr uint32_t A5R9 = 59; }; +constexpr int32_t poly1 = 1000000; +constexpr int32_t poly2 = 1000; + struct ReducePolicy { public: int32_t patternID = -1; int32_t loopARCount = -1; int32_t loopInnerARCount = -1; + int32_t ID = -1; + constexpr ReducePolicy(int32_t patternID_, int32_t loopARCount_, int32_t loopInnerARCount_) + : patternID(patternID_), loopARCount(loopARCount_), loopInnerARCount(loopInnerARCount_), + ID(poly1 * patternID_ + poly2 * loopARCount_ + loopInnerARCount_) + {} bool operator==(const ReducePolicy& rhs) const { return this->patternID == rhs.patternID && this->loopARCount == rhs.loopARCount &&\ this->loopInnerARCount == rhs.loopInnerARCount; } + int32_t getID() { + return poly1 * this->patternID + poly2 * this->loopARCount + this->loopInnerARCount; + } }; static constexpr ReducePolicy REDUCE_POLICY0 { AR_PATTERN::A, AR_COUNT::A1R0, 0 }; @@ -134,6 +145,7 @@ struct ReduceParam { uint32_t workspaceSize = 0; // 申请空间大小 ReduceTilingData tilingData; // Reduce类算子的tiling数据 int32_t nBufferNum = 2; // 每个Queue中的Tensor数量 + int32_t policyId = -1; }; struct ReduceSchLoopInfo { diff --git a/atvc/include/reduce/reduce_device.h b/atvc/include/reduce/reduce_device.h new file mode 100644 index 0000000000000000000000000000000000000000..3b862d04fdc9ac31362cf00332324e0f2a1721f5 --- /dev/null +++ b/atvc/include/reduce/reduce_device.h @@ -0,0 +1,31 @@ +/** + * Copyright (c) Huawei 
Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + + +/*! + * \file reduce_device.h + * \brief + */ + +#ifndef ATVC_REDUCE_DEVICE_H +#define ATVC_REDUCE_DEVICE_H + +#include "common/atvc_opdef.h" +#include "common/const_def.h" +#include "reduce/common/reduce_common.h" +#include "kernel_operator.h" +#ifndef __ASCC_HOST__ +#include "common/kernel_utils.h" +#include "reduce/reduce_sum.h" +#include "reduce/reduce_op_template.h" +#endif + +#endif // ATVC_REDUCE_DEVICE_H \ No newline at end of file diff --git a/atvc/include/reduce/reduce_host.h b/atvc/include/reduce/reduce_host.h index 479a98d9aaea6e8f461b323acbf8cc4edd59754e..7834a895aa671da6261ce9b84b718350356632a2 100644 --- a/atvc/include/reduce/reduce_host.h +++ b/atvc/include/reduce/reduce_host.h @@ -12,7 +12,10 @@ #define ATVC_REDUCE_HOST_H #include "common/atvc_opdef.h" #include "common/dtype_utils.h" +#include "common/const_def.h" +#include "reduce/common/reduce_common.h" #include "reduce/tiling/reduce_tiling.h" +#include "reduce/tiling/tiling_common.h" namespace ATVC { namespace Host { @@ -77,7 +80,7 @@ bool CalcReduceTiling(std::vector inputShape, printf("[ERROR] Reduce template does not support this data type!\n"); return false; } - ReduceTilingInputParam opInput = {reduceDim, inputShape, inputDtype, GetPromoteDataType(inputDtype)}; + OpTiling::ReduceTilingInputParam opInput = {reduceDim, inputShape, inputDtype, GetPromoteDataType(inputDtype)}; OpTiling::ReduceOpTiling tiling(opInput, policy, param); if (tiling.Run() != 0) { printf("[ERROR] Tiling Error\n"); diff --git a/atvc/include/reduce/reduce_op_template.h b/atvc/include/reduce/reduce_op_template.h index 17c747e6df7d218e8e8f659f62e76746aae3f870..7bdbb0600904bdaca6e31b4e3178b286251a8ff9 100644 --- a/atvc/include/reduce/reduce_op_template.h +++ b/atvc/include/reduce/reduce_op_template.h @@ -25,13 +25,14 @@ #include "reduce/reduce_utils/reduce_block_aux.h" #include "reduce/reduce_utils/reduce_util.h" #include "reduce/reduce_utils/reduce_buf_pool.h" +#include "reduce/common/reduce_common.h" namespace ATVC { namespace Kernel { template -class ReduceOpTemplate -{ + class PreCompute = void, class PostCompute = void, + typename ParamPtr = __gm__ ATVC::ReduceParam*> +class ReduceOpTemplate { public: constexpr static ReduceSchLoopInfo SchLoopInfo = KernelUtils::Reduce::GetSchLoopInfo(); using Pattern = typename ReducePattern::GetPattern::T; @@ -55,39 +56,15 @@ public: // 按照输入、输出、ReduceParam、其他标量的顺序传入 // 内部根据ReduceParam进行数据调度并调用ReduceOpTemplate完成计算后搬出到GM - template - __aicore__ inline void Run(GM_ADDR x, GM_ADDR y, GM_ADDR param) { - param_ = reinterpret_cast<__gm__ ReduceParam*>(param); + template + __aicore__ inline void Run(GM_ADDR x, GM_ADDR y, T1 param) { + param_ = reinterpret_cast(param); // 完成一些编译期的检查,比如PreCompute和PostCompute的In、Out个数是否与args的个数匹配 - tiling_ = ¶m_->tilingData; - Init((GM_ADDR)(param_->workspaceAddr), x, y); - Process(); - } -protected: - template - __aicore__ inline void InitArgsWorkspace(GM_ADDR workspace, Args... 
args) - { - workspace_.SetGlobalBuffer((__gm__ PromoteDataType*)workspace); - } - - template - __aicore__ inline void InitArgsOutput(GM_ADDR y, Args... args) - { - output_[start].SetGlobalBuffer((__gm__ DataType*)y); - if constexpr (start + 1 < OutputSize) { - InitArgsOutput(args...); - } - } - - template - __aicore__ inline void InitArgsInput(GM_ADDR x, Args... args) - { - input_[start].SetGlobalBuffer((__gm__ DataType*)x); - if constexpr (start + 1 < InputSize) { - InitArgsInput(args...); - } else { - InitArgsOutput<0>(args...); + if constexpr (AscendC::IsSameType::value) { + Process<__gm__ ATVC::ReduceTilingData>(); + } else if constexpr (AscendC::IsSameType::value) { + Process(); } } @@ -96,8 +73,8 @@ public: __aicore__ inline void Init(GM_ADDR workspace, Args... args) { pipe_ = GetTPipePtr(); - basicBlockLen_ = tiling_->basicBlock; - bufPool_.template Init(pipe_, TBufSize, PromoteBufSize, tiling_->basicBlock); + basicBlockLen_ = this->param_->tilingData.basicBlock; + bufPool_.template Init(pipe_, TBufSize, PromoteBufSize, this->param_->tilingData.basicBlock); InitArgsInput<0>(args...); InitArgsWorkspace(workspace); @@ -123,7 +100,7 @@ public: bufPool_.FreeTensor(tensor); } - template + template __aicore__ inline void Process(Args... args) { if constexpr (SelectReducePolicy.patternID == ATVC::AR_PATTERN::A) { @@ -131,16 +108,20 @@ public: return; } if constexpr (SchLoopInfo.loopRCount == 0) { - using SchTypeA = KernelUtils::Reduce::ReduceBlockAux<__gm__ ATVC::ReduceTilingData, &SchLoopInfo, std::remove_reference_t, DataType, - DataType, PreCompute, PostCompute>; + using SchTypeA = KernelUtils::Reduce::ReduceBlockAux< + T1, &SchLoopInfo, + std::remove_reference_t, DataType, DataType, + PreCompute, PostCompute>; - SchTypeA op(this, input_, output_, tiling_); + SchTypeA op(this, input_, output_, &(this->param_)->tilingData); op.Process(args...); } else { // 完成第一阶段的Reduce - using SchTypeR = KernelUtils::Reduce::ReduceBlockAux<__gm__ ATVC::ReduceTilingData, &SchLoopInfo, std::remove_reference_t, DataType, - PromoteDataType, PreCompute, void>; - SchTypeR op(this, input_, &workspace_, tiling_); + using SchTypeR = KernelUtils::Reduce::ReduceBlockAux< + T1, &SchLoopInfo, + std::remove_reference_t, DataType, + PromoteDataType, PreCompute, void>; + SchTypeR op(this, input_, &workspace_, &(this->param_)->tilingData); op.Process(args...); bufPool_.ResetEvent(); @@ -152,8 +133,10 @@ public: constexpr static ReduceSchLoopInfo groupSchLoopInfo = KernelUtils::Reduce::GetGroupSchLoopInfo(); ATVC::ReduceTilingData groupTiling; SetGroupTiling(groupTiling); - using SchTypeA = KernelUtils::Reduce::ReduceBlockAux, - PromoteDataType, DataType, void, PostCompute>; + using SchTypeA = KernelUtils::Reduce::ReduceBlockAux< + ATVC::ReduceTilingData, &groupSchLoopInfo, + std::remove_reference_t, PromoteDataType, + DataType, void, PostCompute>; SchTypeA groupOp(this, &workspace_, output_, &groupTiling); groupOp.Process(args...); } @@ -162,24 +145,26 @@ public: __aicore__ inline void SetGroupTiling(ATVC::ReduceTilingData& groupTiling) { groupTiling.ubFactorA = ELEMENT_ONE_REPEAT_COMPUTE; - groupTiling.ubFactorR = tiling_->groupR; - groupTiling.shape[0] = tiling_->groupR; - groupTiling.shape[1] = tiling_->outSize; - groupTiling.stride[0] = tiling_->outSize; + groupTiling.ubFactorR = this->param_->tilingData.groupR; + groupTiling.shape[0] = this->param_->tilingData.groupR; + groupTiling.shape[1] = this->param_->tilingData.outSize; + groupTiling.stride[0] = this->param_->tilingData.outSize; groupTiling.stride[1] = 1; - 
groupTiling.dstStride[0] = tiling_->outSize; + groupTiling.dstStride[0] = this->param_->tilingData.outSize; groupTiling.dstStride[1] = 1; groupTiling.groupR = 1; - groupTiling.outSize = tiling_->outSize; + groupTiling.outSize = this->param_->tilingData.outSize; groupTiling.factorRCntPerCore = 1; groupTiling.factorRTotalCnt = 1; groupTiling.factorATotalCnt = OpsUtils::CeilDiv(groupTiling.shape[1], groupTiling.ubFactorA); groupTiling.factorACntPerCore = OpsUtils::CeilDiv(groupTiling.factorATotalCnt, - static_cast(64)); // 按照64核计算,需要tiling传 + static_cast(64)); // 按照64核计算,需要tiling传 } template - __aicore__ inline void CopyInAux(const AscendC::GlobalTensor& src, U& view, V& shape, const AscendC::LocalTensor& ubTensor) + __aicore__ inline void CopyInAux(const AscendC::GlobalTensor &src, + U &view, V &shape, + const AscendC::LocalTensor &ubTensor) { T paddingValue = compute_.template GetPaddingValue(); uint8_t padCnt = ((view.axis[0].dstStride - view.burstLen) * sizeof(T)) % BLOCK_SIZE_BYTE / sizeof(T); @@ -197,34 +182,45 @@ public: (view.axis[0].dstStride - view.burstLen) * sizeof(T) / BLOCK_SIZE_BYTE; // unit block(32byte) bufPool_.SyncTensor(ubTensor); - for (int32_t i = 0; i < view.axis[CONST6].repeat; i++) { - for (int32_t j = 0; j < view.axis[CONST5].repeat; j++) { - for (int32_t k = 0; k < view.axis[CONST4].repeat; k++) { - for (int32_t l = 0; l < view.axis[CONST3].repeat; l++) { - for (int32_t m = 0; m < view.axis[CONST2].repeat; m++) { - for (int32_t n = 0; n < view.axis[CONST1].repeat; n++) { - int64_t dstStride = i * view.axis[CONST6].dstStride + j * view.axis[CONST5].dstStride + - k * view.axis[CONST4].dstStride + l * view.axis[CONST3].dstStride + - m * view.axis[CONST2].dstStride + n * view.axis[CONST1].dstStride; - int64_t srcStride = i * view.axis[CONST6].srcStride + j * view.axis[CONST5].srcStride + - k * view.axis[CONST4].srcStride + l * view.axis[CONST3].srcStride + - m * view.axis[CONST2].srcStride + n * view.axis[CONST1].srcStride; - AscendC::DataCopyPad(ubTensor[dstStride], src[view.addr + srcStride], copyInParams, padParams); - } - } - } - } + const int32_t repeats[CONST6] = {static_cast(view.axis[CONST1].repeat), + static_cast(view.axis[CONST2].repeat), static_cast(view.axis[CONST3].repeat), + static_cast(view.axis[CONST4].repeat), static_cast(view.axis[CONST5].repeat), + static_cast(view.axis[CONST6].repeat)}; + const int32_t dstStrides[CONST6] = {static_cast(view.axis[CONST1].dstStride), + static_cast(view.axis[CONST2].dstStride), static_cast(view.axis[CONST3].dstStride), + static_cast(view.axis[CONST4].dstStride), static_cast(view.axis[CONST5].dstStride), + static_cast(view.axis[CONST6].dstStride)}; + const int32_t srcStrides[CONST6] = {static_cast(view.axis[CONST1].srcStride), + static_cast(view.axis[CONST2].srcStride), static_cast(view.axis[CONST3].srcStride), + static_cast(view.axis[CONST4].srcStride), static_cast(view.axis[CONST5].srcStride), + static_cast(view.axis[CONST6].srcStride)}; + + int32_t total = 1; + for (int32_t i = 0; i < CONST6; ++i) + total *= repeats[i]; + + for (int32_t idx = 0; idx < total; ++idx) { + int32_t tmp = idx; + int32_t dstOffset = 0; + int32_t srcOffset = 0; + for (int32_t axis = 0; axis < CONST6; ++axis) { + int32_t coord = tmp % repeats[axis]; + tmp /= repeats[axis]; + dstOffset += coord * dstStrides[axis]; + srcOffset += coord * srcStrides[axis]; } + AscendC::DataCopyPad(ubTensor[dstOffset], src[view.addr + srcOffset], copyInParams, padParams); } } - __aicore__ inline void CopyInput2Output() { + __aicore__ inline void CopyInput2Output() 
+ { uint32_t shapeSize = 1; for (uint8_t i = 0; i < MAX_DIM; i++) { - if (tiling_->shape[i] <= 1) { + if (this->param_->tilingData.shape[i] <= 1) { break; } - shapeSize = shapeSize * tiling_->shape[i]; + shapeSize = shapeSize * this->param_->tilingData.shape[i]; } shapeSize = (shapeSize * sizeof(DataType) + UB_ALIGN_31) / UB_ALIGN_32 * UB_ALIGN_32 / sizeof(DataType); AscendC::LocalTensor tmpLoc; @@ -287,8 +283,36 @@ public: AscendC::DataCopyPad(dst[view.addr], tempBuf_[tmpBufOffest], copyOutParams); AscendC::PipeBarrier(); } + +protected: + template + __aicore__ inline void InitArgsWorkspace(GM_ADDR workspace, Args... args) + { + workspace_.SetGlobalBuffer((__gm__ PromoteDataType*)workspace); + } + + template + __aicore__ inline void InitArgsOutput(GM_ADDR y, Args... args) + { + output_[start].SetGlobalBuffer((__gm__ DataType*)y); + if constexpr (start + 1 < OutputSize) { + InitArgsOutput(args...); + } + } + + template + __aicore__ inline void InitArgsInput(GM_ADDR x, Args... args) + { + input_[start].SetGlobalBuffer((__gm__ DataType*)x); + if constexpr (start + 1 < InputSize) { + InitArgsInput(args...); + } else { + InitArgsOutput<0>(args...); + } + } + private: - __gm__ ReduceParam* param_; // CalcReduceTiling API计算出的运行态参数 + ParamPtr param_; // CalcReduceTiling API计算出的运行态参数 AscendC::TPipe* pipe_; AscendC::TBuf<> oriVecQue_; AscendC::TBuf<> tempResQue_; @@ -302,10 +326,9 @@ private: AscendC::GlobalTensor output_[OutputSize]; AscendC::GlobalTensor input_[InputSize]; AscendC::GlobalTensor workspace_; - - AscendC::LocalTensor tempUb_; - const __gm__ ATVC::ReduceTilingData* tiling_; - + + AscendC::LocalTensor tempUb_; + int64_t basicBlockLen_; int64_t oriBasicBlockLen_; diff --git a/atvc/include/reduce/reduce_sum.h b/atvc/include/reduce/reduce_sum.h index e5d36a877918a470febf610229712e74ffb24542..d806251313cf876602993d4335c76b212cab62e9 100644 --- a/atvc/include/reduce/reduce_sum.h +++ b/atvc/include/reduce/reduce_sum.h @@ -16,6 +16,20 @@ #include "reduce/common/patterns.h" #include "reduce/reduce_utils/reduce_block_aux_util.h" +namespace { +struct ReduceARParam { + uint32_t repStride = 0; + uint16_t dimA = 0; + uint16_t dimMax = 0; + uint16_t mainR = 0; + uint16_t tailR = 0; + uint64_t maskAddRNum = 0; + uint16_t loopRNum = 0; + uint16_t dtypeSize = 0; + uint16_t dimR = 0; +}; +} + namespace ATVC { // OpTraits: 算子描述的ATVC::OpTraits结构体 @@ -30,17 +44,22 @@ public: __aicore__ inline ReduceSumCompute() {} template - __aicore__ inline void Compute(KernelUtils::Shape<2>& shape, const AscendC::LocalTensor& dst, const AscendC::LocalTensor& src) + __aicore__ inline void + Compute(KernelUtils::Shape<2> &shape, + const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src) { // AR场景,硬件限制,R轴需要做UB上32B对齐,对齐方式有2种: // 1. 高性能对齐(补充元素值不确定), 后续累加计算只能计算实际有效的元素个数 // 2. 补0对齐(补值是由用户实现的GetPaddingValue()接口决定的) if (std::is_same::value) { if constexpr (needMask) { // 1. 
高性能对齐模式 - int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); // MainR(int64_t dimR, bool isAR): 框架提供的计算R轴二分长度(元素个数), dimR为原始的元素个数 + // MainR(int64_t dimR, bool isAR): 框架提供的计算R轴二分长度(元素个数), dimR为原始的元素个数 + int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen); } else { - int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); // MainR:框架提供的计算R轴二分长度(元素个数),dimR为补齐后的元素个数 + // MainR:框架提供的计算R轴二分长度(元素个数),dimR为补齐后的元素个数 + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]); } } @@ -50,20 +69,24 @@ public: } } - __aicore__ inline void ReduceRA(const AscendC::LocalTensor& dst, const AscendC::LocalTensor& src, uint16_t dimA, uint16_t dimR, - uint16_t mainR) + __aicore__ inline void + ReduceRA(const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src, uint16_t dimA, + uint16_t dimR, uint16_t mainR) { uint32_t totalNum = dimR * dimA; uint32_t mainNum = dimA * mainR; uint32_t dtypeSize = sizeof(PrompteDtype); uint32_t tailNum = totalNum - mainNum; - uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; // add mask最大值为256 bytes 且要满足32bytes对齐 + // add mask最大值为256 bytes 且要满足32bytes对齐 + uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; // 处理tail uint16_t repeatTimes = tailNum / maskAddNum; uint16_t repeatNum = repeatTimes * maskAddNum; uint16_t repTailNum = tailNum - repeatNum; uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32; // 不同迭代间同一datablock步长 - AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); if (repeatTimes > 0) { AscendC::Add(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); } @@ -77,9 +100,9 @@ public: AscendC::PipeBarrier(); // 二分主体 uint16_t loopRNum = mainR; - while(loopRNum > 1) { + while (loopRNum > 1) { loopRNum = loopRNum >> 1; - mainNum = loopRNum * dimA;// LoopR的前半部分数据量 + mainNum = loopRNum * dimA; // LoopR的前半部分数据量 repeatTimes = mainNum / maskAddNum; repeatNum = repeatTimes * maskAddNum; repTailNum = mainNum - repeatNum; @@ -102,8 +125,10 @@ public: AscendC::DataCopy(dst, src, dimA); } - __aicore__ inline void ReduceAR(const AscendC::LocalTensor& dstTensor, const AscendC::LocalTensor& srcTensor, uint16_t dimA, uint16_t dimR, - uint16_t mainR, uint64_t oriBurstLen) + __aicore__ inline void + ReduceAR(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, uint16_t dimA, + uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen) { uint16_t tailR = oriBurstLen - mainR; uint16_t dtypeSize = sizeof(PrompteDtype); @@ -111,74 +136,31 @@ public: uint16_t dimMax = dimA * dimR; uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize; + ReduceARParam param{ + .repStride = repStride, + .dimA = dimA, + .dimMax = dimMax, + .mainR = mainR, + .tailR = tailR, + .maskAddRNum = maskAddRNum, + .dtypeSize = dtypeSize, + .dimR = dimR + }; + if (mainR > 0 && tailR > 0) { - uint16_t addRTotalNum = tailR / maskAddRNum * maskAddRNum; - uint16_t addRTail = tailR - addRTotalNum; - AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, 
src0RepStride, src1RepStride - if (repStride > UB_ALIGN_255) { - for (uint16_t i = 0; i < dimMax; i += dimR) { - AscendC::Add(srcTensor[i], srcTensor[i], srcTensor[i + mainR], tailR); - } - } else { - for (uint16_t i = 0; i < addRTotalNum; i += maskAddRNum) { - AscendC::Add(srcTensor[i], srcTensor[i + mainR], srcTensor[i], maskAddRNum, dimA, repeatParams); - } - if (addRTail > 0) { - AscendC::Add(srcTensor[addRTotalNum], srcTensor[addRTotalNum + mainR], srcTensor[addRTotalNum], addRTail, dimA, repeatParams); - } - } - AscendC::PipeBarrier(); + PerformInitialAdd(srcTensor, param); } // 二分计算 - uint16_t loopRNum = mainR; - while(loopRNum > maskAddRNum) { - loopRNum = loopRNum / 2; // 除2二分 - if (repStride > UB_ALIGN_255) { - for (uint16_t i = 0; i < dimMax; i += dimR) { - AscendC::Add(srcTensor[i], srcTensor[i], srcTensor[i + loopRNum], loopRNum); - } - } else { - uint16_t addRTotalNum = loopRNum / maskAddRNum * maskAddRNum; - uint16_t addRTail = loopRNum -addRTotalNum; - AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride - for (uint16_t i = 0; i < addRTotalNum; i += maskAddRNum) { - AscendC::Add(srcTensor[i], srcTensor[i + loopRNum], srcTensor[i], maskAddRNum, dimA, repeatParams); - } - if (addRTail > 0) { - AscendC::Add(srcTensor[addRTotalNum], srcTensor[addRTotalNum], srcTensor[addRTotalNum + loopRNum], addRTail, dimA, repeatParams); - } - } - AscendC::PipeBarrier(); - } - if (loopRNum == 0) { // small shape, 直接reduce - loopRNum = tailR; + param.loopRNum = mainR; + while (param.loopRNum > maskAddRNum) { + param.loopRNum = param.loopRNum / 2; // 除2二分 + PerformBinaryReduction(srcTensor, param); } - if constexpr ( AscendC::IsSameType::value || AscendC::IsSameType::value) { - uint16_t reduceLoopTimes = UB_ALIGN_255 * dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / dtypeSize; - for (uint16_t dimAIdx = 0; dimAIdx < dimA; dimAIdx += reduceLoopTimes) { // WholeReduceSum repeattime最大值为255 255附近为了dimA需要分多次 - uint16_t curDimA = (dimAIdx + reduceLoopTimes < dimA) ? reduceLoopTimes : dimA - dimAIdx; - AscendC::WholeReduceSum(dstTensor[dimAIdx], srcTensor[dimAIdx * dimR], loopRNum, curDimA, 1, 1, repStride); - } - AscendC::PipeBarrier(); - } else if constexpr ( AscendC::IsSameType::value || AscendC::IsSameType::value) { - maskAddRNum = UB_ALIGN_32 / dtypeSize; - // 尽量二分add到最后32bytes - // int32 -> float 都是4字,一把cast 用CAST_NONE - AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); - AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); - AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, dimA *dimR); - AscendC::PipeBarrier(); - uint16_t reduceLoopTimes = 255 * dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / dtypeSize; - for (uint16_t dimAIdx = 0; dimAIdx < dimA; dimAIdx += reduceLoopTimes) { // WholeReduceSum repeattime最大值为255 255附近为了dimA需要分多次 - uint16_t curDimA = (dimAIdx + reduceLoopTimes < dimA) ? 
reduceLoopTimes : dimA - dimAIdx; - AscendC::WholeReduceSum(interpreDst[dimAIdx], interpreSrc[dimAIdx * dimR], loopRNum, curDimA, 1, 1, repStride); - } - AscendC::PipeBarrier(); - AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); - } else { - return; + if (param.loopRNum == 0) { // small shape, 直接reduce + param.loopRNum = tailR; } + PerformFinalReduction(dstTensor, srcTensor, param); } template @@ -202,15 +184,19 @@ public: for (uint16_t i = 0; i < outerLoopTimes; ++i) { // outerLoopTimes是dimA的大小 uint32_t srcIdx = i * outerLoopStride; for (uint16_t j = 0; j < innerLoopTimes; ++j) { - AscendC::Add(srcTensor[srcIdx], srcTensor[srcIdx], dstTensor[srcIdx + j * innerLoopStride], outerLoopStride); + AscendC::Add(srcTensor[srcIdx], srcTensor[srcIdx], + dstTensor[srcIdx + j * innerLoopStride], + outerLoopStride); AscendC::PipeBarrier(); } DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride); } } - __aicore__ inline void ReduceBetweenUB(const AscendC::LocalTensor& ubTensorLeft, - const AscendC::LocalTensor& ubTensorRight, const int32_t& calCount) + __aicore__ inline void + ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, + const AscendC::LocalTensor &ubTensorRight, + const int32_t &calCount) { Add(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); } @@ -221,6 +207,95 @@ public: U paddingValue = 0; // 由于ReduceSum是累加R轴数据,补齐的元素值设为0,才能保证累加的结果不受影响 return paddingValue; } + +private: + __aicore__ inline void PerformInitialAdd(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.tailR - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { + AscendC::Add(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); + } + } else { + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Add(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Add(srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.mainR], + srcTensor[addRTotalNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor, + const ReduceARParam& param) + { + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) { + AscendC::Add(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum); + } + } else { + uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.loopRNum - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Add(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Add(srcTensor[addRTotalNum], + srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.loopRNum], + addRTail, + param.dimA, + repeatParams); + } + } + 
AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceSum repeattime最大值为255 255附近为了dimA需要分多次 + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceSum( + dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride); + } + AscendC::PipeBarrier(); + } else if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + // 尽量二分add到最后32bytes + // int32 -> float 都是4字,一把cast 用CAST_NONE + AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); + AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); + AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); + AscendC::PipeBarrier(); + + uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceSum repeattime最大值为255 255附近为了dimA需要分多次 + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceSum( + interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride); + } + AscendC::PipeBarrier(); + AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); + } + } }; } // namespace ATVC diff --git a/atvc/include/reduce/reduce_utils/reduce_block_aux_util.h b/atvc/include/reduce/reduce_utils/reduce_block_aux_util.h index 612b1d0e3a5eccb8a935f6232715eec6f843154f..adcd122d0e9ad47826790e0ba01d289402c5dd26 100644 --- a/atvc/include/reduce/reduce_utils/reduce_block_aux_util.h +++ b/atvc/include/reduce/reduce_utils/reduce_block_aux_util.h @@ -19,8 +19,9 @@ #include "reduce_util.h" #include "kernel_operator.h" #include "common/platform.h" -#include "common/ops_utils.h" +#include "common/ops_utils_device.h" #include "reduce/common/patterns.h" +#include "reduce/common/reduce_common.h" namespace ATVC { diff --git a/atvc/include/reduce/reduce_utils/reduce_buf_pool.h b/atvc/include/reduce/reduce_utils/reduce_buf_pool.h index 01c63adaaa0391efa278c47d1dfb3d10790cdb5f..f27ec468f374b11d7d1f389fa156a327cb71a5fa 100644 --- a/atvc/include/reduce/reduce_utils/reduce_buf_pool.h +++ b/atvc/include/reduce/reduce_utils/reduce_buf_pool.h @@ -19,7 +19,7 @@ #include "kernel_operator.h" #include "common/platform.h" -#include "common/ops_utils.h" +#include "common/ops_utils_device.h" namespace ATVC { namespace KernelUtils { diff --git a/atvc/include/reduce/reduce_utils/reduce_util.h b/atvc/include/reduce/reduce_utils/reduce_util.h index 082cad8732ed5a36bf7e41febf869c641df85e82..556f2867ae4367a9a20c164643fb8c3f8de047a5 100644 --- a/atvc/include/reduce/reduce_utils/reduce_util.h +++ b/atvc/include/reduce/reduce_utils/reduce_util.h @@ -27,20 +27,20 @@ struct Shape { int64_t oriBurstLen; }; -template +template __aicore__ inline constexpr int32_t GetCopyInCount() { - if constexpr (AscendC::IsSameType::value || AscendC::IsSameType::value) { + if constexpr (AscendC::IsSameType::value || AscendC::IsSameType::value) { 
return CONST2; } else { return CONST3; } } -template +template __aicore__ inline constexpr int32_t GetComputeCount() { - if constexpr (AscendC::IsSameType::value || AscendC::IsSameType::value) { + if constexpr (AscendC::IsSameType::value || AscendC::IsSameType::value) { return CONST2; } else { return CONST0; diff --git a/atvc/include/reduce/tiling/reduce_tiling.h b/atvc/include/reduce/tiling/reduce_tiling.h index f649485c8e7db518c5383d53222893b16a653bcf..92d90df69dbb3db49d1a6a78af342e9818ca06fa 100644 --- a/atvc/include/reduce/tiling/reduce_tiling.h +++ b/atvc/include/reduce/tiling/reduce_tiling.h @@ -17,58 +17,10 @@ #include "graph/types.h" #include "common/compile_info.h" #include "common/const_def.h" -#include "reduce/common/patterns.h" -#include "common/ops_utils.h" - -namespace { -constexpr static int32_t BASIC_BLOCK = 48 * 1024; -constexpr static int32_t CACHE_SIZE = 16 * 1024; // cahce size for ub reduce -constexpr static int32_t MAX_INNER_A = 128; -constexpr static double THRES_HOLD = 0.85; -constexpr static int32_t A_STEP_LEN = 4; - -struct ReduceTilingUnit { - int32_t idx = -1; // ub cut axis - uint64_t inner = 1; // inner size in ub - uint64_t outer = 1; // outer size of ub - uint64_t step = 1; // step of cacheline - void Update(int32_t idx, uint64_t inner, uint64_t outer, uint64_t step) - { - this->idx = idx; - this->inner = inner; - this->outer = outer; - this->step = step; - } -}; - -struct CacheLineBlock { - int32_t axis = -1; // cacheline cut axis - uint64_t size = 1; // cacheline size - uint64_t cacheLineStep = 1; // cacheline cut size for axis - uint64_t cacheLineOuter = 1; // relative to cacheLineStep, out size of cacheline cut axis - uint64_t aSize = 1; // A axis size in cacheline -}; - -struct ReduceTilingInputParam { - std::vector reduceDim = {}; - std::vector reduceShape = {}; - ge::DataType inputDtype = ge::DataType::DT_UNDEFINED; - ge::DataType promoteDtpye = ge::DataType::DT_UNDEFINED; -}; -} +#include "common/ops_utils_host.h" +#include "tiling_common.h" namespace OpTiling { -static void MakeWrapDim(const std::vector& shape, std::vector& axes) -{ - // EnsureNotScalar at least return 1-D Tensor, so shapeSize cannot be 0 - size_t shapeSize = shape.size(); - for (size_t i = 0; i < axes.size(); i++) { - if (axes[i] < 0) { - axes[i] += shapeSize; - } - } -} - class ReduceOpTiling { public: ReduceOpTiling(ReduceTilingInputParam& inputParam, @@ -157,24 +109,6 @@ void CalcWorkSpace() param_->workspaceSize = ATVC::WORKSPACE_SIZE + spaceSize; } -int32_t IsAxesValid(const std::vector& shape, const std::vector& axes) -{ - size_t shapeSize = shape.size(); - size_t axesSize = axes.size(); - if (axesSize > shapeSize) { - printf("[ERROR] axis size is greater than shape size\n"); - return -1; - }; - - for (size_t i = 0; i < axesSize; i++) { - if (axes[i] >= static_cast(shapeSize) || axes[i] < 0) { - printf("[ERROR] axis size incorrect \n"); - return -1; - }; - } - return 0; -} - void EliminateOne(const std::vector& oriShape, std::vector& axes, std::vector& shape, int32_t& shapeSize) { @@ -324,16 +258,6 @@ void CalcBasicBlock() } } -template -bool IsEmtpyTensor(const std::vector& shape) -{ - for (int32_t i = 0; i < Pattern::Dim; i++) { - if (shape[i] == 0) { - return true; - } - } - return false; -} template bool ComputeEmptyTiling(std::vector& shape) { @@ -442,15 +366,6 @@ void ComputeSplit(std::vector& shape) OpsUtils::CeilDiv(unitR_.outer, factorRCntPerCore); param_->tilingData.coreNum = realCore; } -template -bool IsAxisA(int32_t idx) -{ - if (Pattern::FirstA) { - return idx % 
ATVC::CONST2 == 0;
-    } else {
-        return idx % ATVC::CONST2 == 1;
-    }
-}
 
 template
 bool CalcCacheLineStep(const std::vector& shape)
@@ -593,7 +508,7 @@ private:
     int32_t basicBlock_ = 0;
     ATVC::ReduceParam* param_ {nullptr};
     ATVC::ReducePolicy* policy_ {nullptr};
-    ATVC::OpCompileInfo compileInfo_;
+    ATVC::OpCompileInfo compileInfo_ = {0, 0, 0, 0};
     CacheLineBlock cBlock_;
     ReduceTilingUnit unitA_;
     ReduceTilingUnit unitR_;
diff --git a/atvc/include/reduce/tiling/tiling_common.h b/atvc/include/reduce/tiling/tiling_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f96757a6026c739c14ebc007d9bf9112ebf433f
--- /dev/null
+++ b/atvc/include/reduce/tiling/tiling_common.h
@@ -0,0 +1,107 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#ifndef ATVC_TILING_COMMON_H
+#define ATVC_TILING_COMMON_H
+#include "reduce/common/patterns.h"
+
+namespace OpTiling {
+constexpr static int32_t BASIC_BLOCK = 48 * 1024;
+constexpr static int32_t CACHE_SIZE = 16 * 1024; // cache size for ub reduce
+constexpr static int32_t MAX_INNER_A = 128;
+constexpr static double THRES_HOLD = 0.85;
+constexpr static int32_t A_STEP_LEN = 4;
+
+struct ReduceTilingUnit {
+    int32_t idx = -1; // ub cut axis
+    uint64_t inner = 1; // inner size in ub
+    uint64_t outer = 1; // outer size of ub
+    uint64_t step = 1; // step of cacheline
+    void Update(int32_t idx, uint64_t inner, uint64_t outer, uint64_t step)
+    {
+        this->idx = idx;
+        this->inner = inner;
+        this->outer = outer;
+        this->step = step;
+    }
+};
+
+struct CacheLineBlock {
+    int32_t axis = -1; // cacheline cut axis
+    uint64_t size = 1; // cacheline size
+    uint64_t cacheLineStep = 1; // cacheline cut size for axis
+    uint64_t cacheLineOuter = 1; // relative to cacheLineStep, out size of cacheline cut axis
+    uint64_t aSize = 1; // A axis size in cacheline
+};
+
+struct ReduceTilingInputParam {
+    std::vector reduceDim = {};
+    std::vector reduceShape = {};
+    ge::DataType inputDtype = ge::DataType::DT_UNDEFINED;
+    ge::DataType promoteDtpye = ge::DataType::DT_UNDEFINED;
+    ReduceTilingInputParam(std::vector reduceDim_, std::vector reduceShape_,
+        ge::DataType inputDtype_, ge::DataType promoteDtpye_):
+        reduceDim(reduceDim_), reduceShape(reduceShape_), inputDtype(inputDtype_), promoteDtpye(promoteDtpye_){}
+};
+
+inline void MakeWrapDim(const std::vector& shape, std::vector& axes)
+{
+    // EnsureNotScalar at least return 1-D Tensor, so shapeSize cannot be 0
+    size_t shapeSize = shape.size();
+    for (size_t i = 0; i < axes.size(); i++) {
+        if (axes[i] < 0) {
+            axes[i] += shapeSize;
+        }
+    }
+}
+
+template <typename Pattern>
+bool IsAxisA(int32_t idx)
+{
+    if (Pattern::FirstA) {
+        return idx % ATVC::CONST2 == 0;
+    } else {
+        return idx % ATVC::CONST2 == 1;
+    }
+}
+
+inline int32_t IsAxesValid(const std::vector& shape, const std::vector& axes)
+{
+    size_t shapeSize = shape.size();
+    size_t axesSize = axes.size();
+    if (axesSize > shapeSize) {
+        printf("[ERROR] axis size is greater than shape size\n");
+        return -1;
+    };
+
+    for (size_t i = 0; i < axesSize; i++) {
+        if (axes[i] >= static_cast(shapeSize) || axes[i] < 0) {
+            printf("[ERROR] axis size incorrect \n");
+            return -1;
+        };
+    }
+    return 0;
+}
+
+template <typename Pattern>
+bool IsEmtpyTensor(const std::vector& shape)
+{
+    for (int32_t i = 0; i < Pattern::Dim; i++) {
+        if (shape[i] == 0) {
+            return true;
+        }
+    }
+    return false;
+}
+
+} // namespace OpTiling
+
+#endif // ATVC_TILING_COMMON_H
\ No newline at end of file
diff --git a/atvc/tests/run_test.sh b/atvc/tests/run_test.sh
index 5c291c0eb16777075aec3e95e577266aae0b9856..667e24b6d14d8b883d253c477e896f8ab198fea6 100644
--- a/atvc/tests/run_test.sh
+++ b/atvc/tests/run_test.sh
@@ -23,7 +23,7 @@ else
 fi
 
 ATVC_HOME_DIR=$CURRENT_DIR/../
-TEST_CASE_LIST="add add_with_scalar reduce_sum sinh_custom broadcast_to"
+TEST_CASE_LIST=$(ls $ATVC_HOME_DIR/examples|xargs)
 if [ $# -ne 1 ]; then
     echo "This script takes only one test case name as input. Execution example: 'bash run_test.sh [$TEST_CASE_LIST]'"
     exit 1
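The two packing constants added to `reduce_common.h` above make each `ReducePolicy` self-describing: the new constexpr constructor folds `patternID`, `loopARCount` and `loopInnerARCount` into a single integer (`ID = 1000000 * patternID + 1000 * loopARCount + loopInnerARCount`), and `getID()` recomputes the same value from the fields. Below is a minimal standalone sketch of that encoding, using a simplified mirror of the patched struct; the field values are hypothetical, and `operator==`/`getID()` are omitted:

```cpp
#include <cassert>
#include <cstdint>

// Simplified mirror of ATVC::ReducePolicy from reduce_common.h (illustration only).
constexpr int32_t poly1 = 1000000;
constexpr int32_t poly2 = 1000;

struct ReducePolicy {
    int32_t patternID = -1;
    int32_t loopARCount = -1;
    int32_t loopInnerARCount = -1;
    int32_t ID = -1;
    constexpr ReducePolicy(int32_t patternID_, int32_t loopARCount_, int32_t loopInnerARCount_)
        : patternID(patternID_), loopARCount(loopARCount_), loopInnerARCount(loopInnerARCount_),
          ID(poly1 * patternID_ + poly2 * loopARCount_ + loopInnerARCount_) {}
};

int main()
{
    constexpr ReducePolicy p{2, 10, 0};  // hypothetical field values
    static_assert(p.ID == 2010000, "packed ID mismatch");
    // As long as loopARCount and loopInnerARCount stay below 1000, the packed
    // ID decodes losslessly back into the three original fields.
    assert(p.ID / poly1 == p.patternID);
    assert(p.ID % poly1 / poly2 == p.loopARCount);
    assert(p.ID % poly2 == p.loopInnerARCount);
    return 0;
}
```

Note that declaring this constructor suppresses the implicit default constructor, so a `ReducePolicy` (and likewise a `BroadcastPolicy`, which gains an analogous constructor in this patch) can no longer be default-constructed; every declaration must supply all three fields, as the `REDUCE_POLICY0`-style constants do.

The host-side helpers split out of `ops_utils.h` into `ops_utils_host.h` also gained zero-divisor guards. The sketch below restates two of them locally to show the intended rounding behavior; it re-declares the helpers for illustration rather than including the ATVC header:

```cpp
#include <cassert>
#include <cstdint>

// Local restatement of OpsUtils::CeilDiv / CeilAlign (host variant, no __aicore__).
template <typename T>
inline T CeilDiv(T a, T b)
{
    if (b == 0) {  // guard added by the patch: a zero divisor returns the input unchanged
        return a;
    }
    return (a + b - 1) / b;
}

template <typename T>
inline T CeilAlign(T a, T b)
{
    if (b == 0) {
        return a;
    }
    return (a + b - 1) / b * b;
}

int main()
{
    assert(CeilDiv<int64_t>(10, 4) == 3);    // 10 / 4 rounded up
    assert(CeilAlign<int64_t>(10, 4) == 12); // next multiple of 4
    assert(CeilDiv<int64_t>(10, 0) == 10);   // zero divisor: input passed through
    return 0;
}
```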